It is similar to the more basic Levenshtein distance but the Jaro distance also accounts for transpositions between letters in the words.
With the Winkler modification to the Jaro metric, the Jaro-Winkler distance also adds an increase in similarity for words which start with the same letters (prefix).
More details you can find here:
https://en.wikipedia.org/wiki/Jaro–Winkler_distance
My attempt is was based on a conversion of the code from FreeBASIC published in Rosetta Code
https://rosettacode.org/wiki/Jaro-Winkler_distance#Go
You can downloand the necessary (for the test purpose) dictionary file here:
http://wiki.puzzlers.org/pub/wordlists/unixdict.txt
Best regards,
Thanos
Code: Select all
EnableExplicit
;~ Δομή για την αποθήκευση λέξεων και αποστάσεων
Structure WordDist
sWord.s
dDist.d
EndStructure
;~ Συνάρτηση Jaro Similarity
Procedure.d jaroSim(sStr1.s, sStr2.s)
Protected i
Protected k
Protected Initial
Protected Final
Protected MatchDistance
Protected dMatches.d = 0
Protected dTranspositions.d = 0
Protected Dim bMatchesOfsStr1.b(0)
Protected Dim bMatchesOfsStr2.b(0)
If Len(sStr1) = 0 And Len(sStr2) = 0
ProcedureReturn 1.0
EndIf
If Len(sStr1) = 0 Or Len(sStr2) = 0
ProcedureReturn 0.0
EndIf
If Len(sStr1) > Len(sStr2)
MatchDistance = Len(sStr1)
Else
MatchDistance = Len(sStr2)
EndIf
MatchDistance = MatchDistance / 2 - 1
ReDim bMatchesOfsStr1(Len(sStr1))
ReDim bMatchesOfsStr2(Len(sStr2))
For i = 0 To Len(sStr1) - 1
Initial = i - MatchDistance
If Initial < 0
Initial = 0
EndIf
Final = i + MatchDistance + 1
If Final > Len(sStr2) - 1
Final = Len(sStr2) - 1
EndIf
For k = Initial To Final
If bMatchesOfsStr2(k)
Continue
EndIf
If Mid(sStr1, i + 1, 1) <> Mid(sStr2, k + 1, 1)
Continue
EndIf
bMatchesOfsStr1(i) = #True
bMatchesOfsStr2(k) = #True
dMatches + 1
Break
Next k
Next i
If dMatches = 0
ProcedureReturn 0
EndIf
k = 0
For i = 0 To Len(sStr1) - 1
If Not bMatchesOfsStr1(i)
Continue
EndIf
While Not bMatchesOfsStr2(k)
k + 1
Wend
If Mid(sStr1, i + 1, 1) <> Mid(sStr2, k + 1, 1)
dTranspositions + 1
EndIf
k + 1
Next i
dTranspositions / 2
ProcedureReturn (dMatches / Len(sStr1) + dMatches / Len(sStr2) + (dMatches - dTranspositions) / dMatches) / 3
EndProcedure
; Συνάρτηση Jaro-Winkler Distance
Procedure.d jaroWinklerDist(sStr1.s, sStr2.s)
Protected i
Protected length
Protected lenTxt1 = Len(sStr1)
Protected lenTxt2 = Len(sStr2)
Protected lenMax
Protected dJSim.d
Protected p.d
Protected dJWD.d
If (lenTxt1 < lenTxt2)
lenMax = lenTxt1
Else
lenMax = lenTxt2
EndIf
If (lenMax > 4): lenMax = 4: EndIf
length = 0
For i = 0 To lenMax - 1
If Mid(sStr1, i + 1, 1) = Mid(sStr2, i + 1, 1)
length + 1
EndIf
Next i
dJSim = jaroSim(sStr1, sStr2)
p.d = 0.1
dJWD = dJSim + length * p * (1 - dJSim)
ProcedureReturn 1 - dJWD
EndProcedure
;~ Main()
OpenConsole()
;~ Κύριο πρόγραμμα
Dim arMisspelt.s(8)
arMisspelt(0) = "accomodate"
arMisspelt(1) = "definately"
arMisspelt(2) = "goverment"
arMisspelt(3) = "occured"
arMisspelt(4) = "publically"
arMisspelt(5) = "recieve"
arMisspelt(6) = "seperate"
arMisspelt(7) = "untill"
arMisspelt(8) = "wich"
;~ Διάβασμα του αρχείου λεξικού
Define sWord.s
Define wordsFile.s = "unixdict.txt"
Define wordsCnt = 0
Define hFile
Define minDist
Define temp.WordDist
Dim arWords.s(1000)
hFile = ReadFile(#PB_Any, wordsFile)
If (hFile)
While Not Eof(hFile)
sWord = ReadString(hFile)
If Len(sWord) > 0
arWords(wordsCnt) = sWord
wordsCnt + 1
If (wordsCnt % 1000) = 0
ReDim arWords(wordsCnt + 1000)
EndIf
EndIf
Wend
CloseFile(hFile)
Define i, j, k
For i = 0 To 8
Dim arClosest.WordDist(100)
Define closestCnt = 0
Define sMisspell.s = arMisspelt(i)
PrintN("Misspelt word: " + sMisspell + ":")
For j = 0 To wordsCnt - 1
If Len(arWords(j)) = 0
Continue
EndIf
Define jwd.d = jaroWinklerDist(sMisspell, arWords(j))
If jwd < 0.15
arClosest(closestCnt)\sWord = arWords(j)
arClosest(closestCnt)\dDist = jwd
closestCnt + 1
EndIf
Next j
; Απλός bubble sort για τις πιο κοντινές αντιστοιχίες
For j = 0 To closestCnt - 2
For k = 0 To closestCnt - 2 - j
If arClosest(k)\dDist > arClosest(k + 1)\dDist
; Swap arClosest(k), arClosest(k + 1)
temp.WordDist
temp = arClosest(k)
arClosest(k) = arClosest(k + 1)
arClosest(k + 1) = temp
EndIf
Next k
Next j
; Εκτύπωση των 6 κορυφαίων αντιστοιχιών
Define minDist = 5
If 5 > (closestCnt - 1)
minDist = (closestCnt - 1)
EndIf
For j = 0 To minDist
PrintN(StrD(arClosest(j)\dDist, 4) + " " + arClosest(j)\sWord)
Next j
PrintN("")
Next i
Else
PrintN("Cannot open file " + wordsFile + ".")
EndIf
Input() ;~ Αναμονή για πληκτρολόγηση πριν το κλείσιμο