Ratcliff / Obershelp Pattern Recognition
Algorithm creator(s)
John W. Ratcliff, Obershelp (?), David Metzener (?)
PB author(s)
Mike Tora, Semen Matusovski
Description
Computes the similarity of two strings as the number of matching characters divided by the total number of characters in the two strings. Matching characters are those in the longest common subsequence plus, recursively, matching characters in the unmatched region on either side of the longest common subsequence.
Note
n/a
Source
https://forum.powerbasic.com/forum/user-to-user-discussions/programming/19002-soundex-search-routine-for-pb-dll?p=245578#post245578
See also
n/a
Source Code
Download source code file ratcliff-obershelp.bas (Right-click -> "Save as ...")
#Compile Exe
#Dim All
#Register None
Function Similarity(String1 As String, String2 As String) As Double
#Register None
Dim startstr1 As Long, endstr1 As Long, lenstr1 As Long, startstr2 As Long, endstr2 As Long, lenstr2 As Long
Dim ns1 As Long, ns2 As Long, Maximum As Long, kMaximum As Long, nRef As Long
lenstr1 = Len(String1) - 1
lenstr2 = Len(String2) - 1
startstr1 = StrPtr(String1)
endstr1 = startstr1 + lenstr1
startstr2 = StrPtr(String2)
endstr2 = startstr2 + lenstr2
'--- Preserve registers ---
! PUSH EBX
! PUSH EDI
! PUSH ESI
'--- Put first element to query ---
! PUSH startstr1 ' Address (not index) of first byte in String1
! PUSH endstr1 ' Address of last byte in String1
! PUSH startstr2 ' Address of first byte in String2
! PUSH endstr2 ' Address of last byte in String2
Label001:
' Query is empty ?
! CMP nRef, 0 ' If nRef < 0 Then _
! JL Label009 ' Goto Label009
' Take a combination from query
! POP endstr2
! POP startstr2
! POP endstr1
! POP startstr1
! DEC nRef
! MOV kMaximum, 0 ' kMaximum = 0
! MOV ESI, startstr1 ' c1 (ESI) = startstr1 ' Replacement of
Label002: '
! CMP ESI, endstr1 ' If c1 > endstr1 Then _ '
! JG Label008 ' Goto Label008 ' For c1 = startstr1 To endstr1
! MOV EDI, startstr2 ' c2 (EDI) = startstr1 ' Replacement of
Label003: '
! CMP EDI, endstr2 ' If c2 > endstr2 Then _ '
! JG Label007 ' Goto Label007 ' For c2 = startstr2 To endstr2
! MOV EAX, ESI ' Temporary variables k1 (EAX), k2 (EBX) (are used instead of i)
! MOV EBX, EDI ' k1 = c1: k2 = c2
Label004:
! MOV CL, [EAX] ' Eq. arr1(c1 + i)
! MOV CH, [EBX] ' Eq. arr2(c2 + i)
! CMP CL, CH ' If arr1(c1 + i) <> arr2(c2 + i) Then _
! JNE Label006 ' Goto Label006
! INC EAX ' Incr k1 (Instead of Incr i)
! INC EBX ' Incr k2
! MOV ECX, EAX ' ECX (eq. i) = k1 - c1
! SUB ECX, ESI '
! CMP ECX, kMaximum ' If i <= kMaximum Then _
! JLE Label005 ' Goto Label005
! MOV ns1, ESI ' ns1 = c1
! MOV ns2, EDI ' ns2 = c2
! MOV kMaximum, ECX ' kMaximum = i
Label005:
! CMP EAX, endstr1 ' If c1 > endstr1 Then _
! JG Label006 ' Goto Label006 (Exit Do)
! CMP EBX, endstr2 ' If c2 > endstr2 Then _
! JLE Label004 ' Goto Label004 (Exit Do)
Label006:
! INC EDI ' Incr c2
! JMP Label003 ' Goto Label003 (eq. Next c2)
Label007:
! INC ESI ' Incr c1
! JMP Label002 ' Goto Label003 (eq. Next c1)
Label008:
! CMP kMaximum, 0 ' If kMaximum = 0 Then _
! JE Label001 ' Goto Label001
' Instead of SubStrSim(ns1 + Maximum, endstr1, ns2 + Maximum, endstr2)
! INC nRef ' Next combination in query
! MOV ESI, ns1
! ADD ESI, kMaximum ' ns1 + Maximum
! MOV EDI, ns2
! ADD EDI, kMaximum ' ns2 + Maximum
! PUSH ESI
! PUSH endstr1
! PUSH EDI
! PUSH endstr2
' Instead of SubStrSim(startstr1, ns1 - 1, startstr2, ns2 - 1)
! DEC ns1 ' ns1 - 1
! DEC ns2 ' ns2 - 1
! INC nRef ' Next combination in query
! PUSH startstr1
! PUSH ns1
! PUSH startstr2
! PUSH ns2
! MOV ECX, kMaximum ' Maximum =
! ADD Maximum, ECX ' Maximum + kMaximum
! JMP Label001 ' Continue
' Restore registers
Label009:
! POP ESI
! POP EDI
! POP EBX
Similarity = Maximum / (lenstr1 + lenstr2 + 2) * 2
End Function
Function PbMain
Local simindex As Double, str1 As String, str2 As String, result As String, _
i As Long, start As Single, finish As Single
str1 = UCase$("Borje Hagsten")
str2 = UCase$("Boje Hasgten") ' With errors
start = Timer
For i = 1 To 100000
simindex = Similarity(str1, str2)
Next
finish = Timer
result = Format$(simindex * 100, "##.##")
MsgBox "String Similarity Index = " & result & "%. Time =" & Format$(finish - start, "#,.#####") & " Secs / 100000 calls"
End Function