So, hab nun aus folgender Quelle einen Code geschrieben:
Unicode® Technical Standard #10 - UNICODE COLLATION ALGORITHM
Der Code lädt (wenn nicht vorhanden) die Hash-Table von der Website runter, kann daher am Anfang etwas dauern.
Ansonsten bin ich mit dem Resultat sehr zufrieden, wobei ich die Geschwindigkeit noch nicht getestet habe.
Code: Alles auswählen
Structure CollationTable
Array Key.q(0)
EndStructure
Structure CharacterArray
c.c[0]
EndStructure
Structure UnicodeArray
u.u[0]
EndStructure
Global Dim CollationTable.CollationTable($FFFF)
Procedure ImportTable()
; References: https://www.unicode.org/Public/UCA/latest/allkeys.txt
Protected File.i
Protected Line.s
Protected UnicodeNumber.i = CreateRegularExpression(#PB_Any, "^[\dABCDEFabcdef]+(?=\s*;)")
Protected CollationNumber.i = CreateRegularExpression(#PB_Any, "\[[*.](\w{4})\.(\w{4})\.(\w{4})\]")
Protected Unicode.i, I.i, Key.q
Protected Dim Extract.s(0)
Protected Result.i
If FileSize(GetTemporaryDirectory()+"UCA_latest_allkeys.txt") > 0 Or ReceiveHTTPFile("https://www.unicode.org/Public/UCA/latest/allkeys.txt", GetTemporaryDirectory()+"UCA_latest_allkeys.txt")
File= ReadFile(#PB_Any, GetTemporaryDirectory()+"UCA_latest_allkeys.txt")
If File
While Not Eof(File)
Line.s = ReadString(File, #PB_UTF8)
If ExtractRegularExpression(UnicodeNumber, Line, Extract())
Unicode = Val("$"+Extract(0))
If Unicode <= $FFFF And ExamineRegularExpression(CollationNumber, Line)
;Debug "Unicode: "+Str(Unicode)
I = 0
While NextRegularExpressionMatch(CollationNumber)
ReDim CollationTable(Unicode)\Key(I)
Key.q = Val("$" + RegularExpressionGroup(CollationNumber, 1) + RegularExpressionGroup(CollationNumber, 2) + RegularExpressionGroup(CollationNumber, 3))
CollationTable(Unicode)\Key(I) = Key
I + 1
Wend
EndIf
EndIf
Wend
CloseFile(File)
Result = #True
EndIf
EndIf
FreeRegularExpression(UnicodeNumber)
FreeRegularExpression(CollationNumber)
ProcedureReturn Result
EndProcedure
Procedure StringCollation_StringKey(*Char.CharacterArray)
; References: http://www.unicode.org/reports/tr10/#Main_Algorithm
Protected I.i, J.i, Length.i, Level.i, Key.q, Max.i
Protected *CollationTable.CollationTable
Protected *Buffer.UnicodeArray, N.i
I = 0
While *Char\c[I]
*CollationTable = CollationTable(*Char\c[I])
Length + (ArraySize(*CollationTable\Key())+1)
I + 1
Wend
*Buffer = AllocateMemory((Length*3+2)*2)
N = 0
For Level = 1 To 3
I = 0
If Level <> 1
*Buffer\u[N] = 0
N + 1
EndIf
While *Char\c[I]
*CollationTable = CollationTable(*Char\c[I])
Max = ArraySize(*CollationTable\Key())
For J = 0 To Max
Key = *CollationTable\Key(J)>>(48-Level*16) & $FFFF
If Key <> 0
*Buffer\u[N] = Key
N + 1
EndIf
Next
I + 1
Wend
Next
*Buffer = ReAllocateMemory(*Buffer, N*2)
;ShowMemoryViewer(*Buffer, N*2)
ProcedureReturn *Buffer
EndProcedure
Procedure StringCollation(String1.s, String2.s)
Protected *Key1.UnicodeArray, *Key2.UnicodeArray
Protected I.i, Max.i, Result.i = 0
*Key1.UnicodeArray = StringCollation_StringKey(@String1)
*Key2.UnicodeArray = StringCollation_StringKey(@String2)
If MemorySize(*Key2) < MemorySize(*Key1)
Max = MemorySize(*Key2)>>1 - 1
Else
Max = MemorySize(*Key1)>>1 - 1
EndIf
For I = 0 To Max
;Debug Hex(*Key1\u[I]) + " :: " + Hex(*Key2\u[I])
If *Key1\u[I] > *Key2\u[I]
Result = -1
Break
ElseIf *Key1\u[I] < *Key2\u[I]
Result = 1
Break
EndIf
Next
FreeMemory(*Key1)
FreeMemory(*Key2)
ProcedureReturn Result
EndProcedure
;- Example
Procedure BubblesortSort(List Word.s())
Protected J.i, I.i, Word1.s, Word2.s, *ID1, *ID2
For J = ListSize(Word()) To 1 Step -1
For I = 2 To ListSize(Word())
*ID1 = SelectElement(Word(), I-2)
Word1 = Word()
*ID2 = SelectElement(Word(), I-1)
Word2 = Word()
If StringCollation(Word1, Word2) = -1
SwapElements(Word(), *ID1, *ID2)
EndIf
Next
Next
EndProcedure
InitNetwork()
Define NewList Word.s()
AddElement(Word()) : Word() = "dab"
AddElement(Word()) : Word() = "cáb"
AddElement(Word()) : Word() = "cab"
AddElement(Word()) : Word() = "Cab"
AddElement(Word()) : Word() = "resume"
AddElement(Word()) : Word() = "résumé"
AddElement(Word()) : Word() = "RÉSUMÉ"
AddElement(Word()) : Word() = "RESUME"
AddElement(Word()) : Word() = "Resume"
AddElement(Word()) : Word() = "Résumé"
AddElement(Word()) : Word() = "αλΦα"
AddElement(Word()) : Word() = "άλφα"
AddElement(Word()) : Word() = "αλφα"
AddElement(Word()) : Word() = "άΛφα"
If ImportTable()
BubblesortSort(Word())
ForEach Word()
Debug Word()
Next
EndIf
Edit: Die Hash-Table kann man natürlich rückwirkend in binärer Form speichern.
________________________________________________________________________________________________
Edit2: Hier nun sowohl die binäre Variante als auch eine (vermutlich) schnellere Variante, da ich jetzt keinen Zusatzspeicher anlege:
Der eigentliche Code:
Code: Alles auswählen
CompilerIf Defined(CharacterArray, #PB_Structure) = #False
Structure CharacterArray
c.c[0]
EndStructure
CompilerEndIf
CompilerIf Defined(QuadArray, #PB_Structure) = #False
Structure QuadArray
q.q[0]
EndStructure
CompilerEndIf
Procedure UCA_StringCollation(*String1.CharacterArray, *String2.CharacterArray)
Protected Character1.i, Character2.i ; Character position
Protected Key1.i, Key2.i ; Key position in the character
Protected Level1.i, Level2.i ; Level position in the key
Protected Number1.i, Number2.i ; Ordnungsnummer
Protected *KeyList1.QuadArray, *KeyList2.QuadArray
Repeat
Repeat
*KeyList1 = ?UCA_KeyList + SizeOf(Quad)*PeekL(?UCA_HashTable + *String1\c[Character1]*SizeOf(Long))
Number1 = *KeyList1\q[Key1] >> (Level1*16) & $FFFF
If Number1 = 0
If *String1\c[Character1] = #NUL
Break
ElseIf Key1 < *KeyList1\q[0]>>48-1
Key1 + 1
Else
Key1 = 0
Character1 + 1
EndIf
EndIf
Until Number1
Repeat
*KeyList2 = ?UCA_KeyList + SizeOf(Quad)*PeekL(?UCA_HashTable + *String2\c[Character2]*SizeOf(Long))
Number2 = *KeyList2\q[Key2] >> (Level2*16) & $FFFF
If Number2 = 0
If *String2\c[Character2] = #NUL
Break
ElseIf Key2 < *KeyList2\q[0]>>48-1
Key2 + 1
Else
Key2 = 0
Character2 + 1
EndIf
EndIf
Until Number2
;Debug "Level "+Level1+" | Char "+Character1+" | Key "+Hex(Number1) + " :: " + "Level "+Level2+" | Char "+Character2+" | Key "+Hex(Number2)
If Number1 > Number2
ProcedureReturn -1
ElseIf Number1 < Number2
ProcedureReturn 1
EndIf
If *String1\c[Character1] = #NUL
Character1 = 0
Key1 = 0
Level1 + 1
Else
If Key1 < *KeyList1\q[0]>>48-1
Key1 + 1
Else
Key1 = 0
Character1 + 1
EndIf
EndIf
If *String2\c[Character2] = #NUL
Character2 = 0
Key2= 0
Level2 + 1
Else
If Key2 < *KeyList2\q[0]>>48-1
Key2 + 1
Else
Key2 = 0
Character2 + 1
EndIf
EndIf
Until Level1 = 4 Or Level2 = 4
ProcedureReturn 0
DataSection
UCA_HashTable:
IncludeBinary "UCA_HashTable.bin"
UCA_KeyList:
IncludeBinary "UCA_KeyList.bin"
EndDataSection
EndProcedure
;- Example
Procedure BubblesortSort(List Word.s())
Protected J.i, I.i, Word1.s, Word2.s, *ID1, *ID2
For J = ListSize(Word()) To 1 Step -1
For I = 2 To ListSize(Word())
*ID1 = SelectElement(Word(), I-2)
Word1 = Word()
*ID2 = SelectElement(Word(), I-1)
Word2 = Word()
If UCA_StringCollation(@Word1, @Word2) = -1
SwapElements(Word(), *ID1, *ID2)
EndIf
Next
Next
EndProcedure
Define NewList Word.s()
AddElement(Word()) : Word() = "dab"
AddElement(Word()) : Word() = "cáb"
AddElement(Word()) : Word() = "cab"
AddElement(Word()) : Word() = "Cab"
AddElement(Word()) : Word() = "resume"
AddElement(Word()) : Word() = "résumé"
AddElement(Word()) : Word() = "RÉSUMÉ"
AddElement(Word()) : Word() = "RESUME"
AddElement(Word()) : Word() = "Resume"
AddElement(Word()) : Word() = "Résumé"
AddElement(Word()) : Word() = "αλΦα"
AddElement(Word()) : Word() = "άλφα"
AddElement(Word()) : Word() = "αλφα"
AddElement(Word()) : Word() = "άΛφα"
BubblesortSort(Word())
ForEach Word()
Debug Word()
Next
Und hier der Code zum generieren der beiden Binärdateien für die DataSection:
Code: Alles auswählen
Structure CollationTable
Array Key.q(0)
EndStructure
Global Dim CollationTable.CollationTable($FFFF)
Procedure GenerateTable()
; References: https://www.unicode.org/Public/UCA/latest/allkeys.txt
Protected File.i
Protected Line.s
Protected UnicodeNumber.i = CreateRegularExpression(#PB_Any, "^[\dABCDEFabcdef]+(?=\s*;)")
Protected CollationNumber.i = CreateRegularExpression(#PB_Any, "\[[*.](\w{4})\.(\w{4})\.(\w{4})\]")
Protected Unicode.i, I.i, Key.q
Protected Dim Extract.s(0)
Protected Result.i, Offset.i
; Import
If ReceiveHTTPFile("https://www.unicode.org/Public/UCA/latest/allkeys.txt", GetTemporaryDirectory()+"UCA_latest_allkeys.txt")
File= ReadFile(#PB_Any, GetTemporaryDirectory()+"UCA_latest_allkeys.txt")
If File
While Not Eof(File)
Line.s = ReadString(File, #PB_UTF8)
If ExtractRegularExpression(UnicodeNumber, Line, Extract())
Unicode = Val("$"+Extract(0))
If Unicode <= $FFFF And ExamineRegularExpression(CollationNumber, Line)
;Debug "Unicode: "+Str(Unicode)
I = 0
While NextRegularExpressionMatch(CollationNumber)
ReDim CollationTable(Unicode)\Key(I)
Key.q = Val("$" + RegularExpressionGroup(CollationNumber, 3) + RegularExpressionGroup(CollationNumber, 2) + RegularExpressionGroup(CollationNumber, 1))
CollationTable(Unicode)\Key(I) = Key
I + 1
Wend
EndIf
EndIf
Wend
CloseFile(File)
EndIf
EndIf
FreeRegularExpression(UnicodeNumber)
FreeRegularExpression(CollationNumber)
; Export
File = CreateFile(#PB_Any, "UCA_HashTable.bin")
If File
; Sprungtabelle (Abstand in Quads)
Offset = 0
For Unicode = 0 To $FFFF
CollationTable(Unicode)\Key(0) | (ArraySize(CollationTable(Unicode)\Key())+1)<<48
WriteLong(File, Offset)
Offset + (ArraySize(CollationTable(Unicode)\Key())+1)
Next
CloseFile(File)
EndIf
File = CreateFile(#PB_Any, "UCA_KeyList.bin")
If File
; Daten
For Unicode = 0 To $FFFF
WriteData(File, @CollationTable(Unicode)\Key(), SizeOf(Quad)*(ArraySize(CollationTable(Unicode)\Key())+1))
Next
CloseFile(File)
EndIf
ProcedureReturn Result
EndProcedure
InitNetwork()
GenerateTable()