Implement fuzzy search (Views: 28)
Problem/Question/Abstract: How to implement fuzzy search Answer: Solve 1: This DLL calculates the Levenshtein Distance between two strings. Please note that ShareMem must be the first unit in the Uses clause of the Interface section of your unit, if your DLL exports procedures or functions, which pass string parameters or function results. ShareMem is the interface to delphimm.dll, which you have to distribute together with your own DLL. To avoid using delphimm.dll, pass string parameters by using PChar or ShortString parameters. library Levensh; uses ShareMem, SysUtils; var FiR0: integer; FiP0: integer; FiQ0: integer; function Min(X, Y, Z: Integer): Integer; begin if (X < Y) then Result := X else Result := Y; if (Result > Z) then Result := Z; end; procedure LevenshteinPQR(p, q, r: integer); begin FiP0 := p; FiQ0 := q; FiR0 := r; end; function LevenshteinDistance(const sString, sPattern: string): Integer; const MAX_SIZE = 50; var aiDistance: array[0..MAX_SIZE, 0..MAX_SIZE] of Integer; i, j, iStringLength, iPatternLength, iMaxI, iMaxJ: Integer; chChar: Char; iP, iQ, iR, iPP: Integer; begin iStringLength := length(sString); if (iStringLength > MAX_SIZE) then iMaxI := MAX_SIZE else iMaxI := iStringLength; iPatternLength := length(sPattern); if (iPatternLength > MAX_SIZE) then iMaxJ := MAX_SIZE else iMaxJ := iPatternLength; aiDistance[0, 0] := 0; for i := 1 to iMaxI do aiDistance[i, 0] := aiDistance[i - 1, 0] + FiR0; for j := 1 to iMaxJ do begin chChar := sPattern[j]; if ((chChar = '*') or (chChar = '?')) then iP := 0 else iP := FiP0; if (chChar = '*') then iQ := 0 else iQ := FiQ0; if (chChar = '*') then iR := 0 else iR := FiR0; aiDistance[0, j] := aiDistance[0, j - 1] + iQ; for i := 1 to iMaxI do begin if (sString[i] = sPattern[j]) then iPP := 0 else iPP := iP; {aiDistance[i, j] := Minimum of 3 values} aiDistance[i, j] := Min(aiDistance[i - 1, j - 1] + iPP, aiDistance[i, j - 1] + iQ, aiDistance[i - 1, j] + iR); end; end; Result := aiDistance[iMaxI, iMaxJ]; end; exports LevenshteinDistance Index 1, LevenshteinPQR Index 2; begin FiR0 := 1; FiP0 := 1; FiQ0 := 1; end. Solve 2: This is an old Pascal code snippet, which is based on a C project published in the C't magazine somewhen back in the 1990's. Can't remember where I found it on the WWW. Please note that the code below accesses a simple *.txt file to search in. program FuzzySearch; {Translation from C to Pascal by Karsten Paulini and Simon Reinhardt} const MaxParLen = 255; var InFile: Text; Filename: string; InputStr: string; SearchStr: string; Treshold: Integer; function PrepareTheString(OriginStr: string; var ConvStr: string): Integer; var i: Integer; begin ConvStr := OriginStr; for i := 1 to Length(OriginStr) do begin ConvStr[i] := UpCase(ConvStr[i]); if ConvStr[i] < '0' then ConvStr[i] := ' ' else case ConvStr[i] of Chr(196): ConvStr[i] := Chr(228); Chr(214): ConvStr[i] := Chr(246); Chr(220): ConvStr[i] := Chr(252); Chr(142): ConvStr[i] := Chr(132); Chr(153): ConvStr[i] := Chr(148); Chr(154): ConvStr[i] := Chr(129); ':': ConvStr[i] := ' '; ';': ConvStr[i] := ' '; '<': ConvStr[i] := ' '; '>': ConvStr[i] := ' '; '=': ConvStr[i] := ' '; '?': ConvStr[i] := ' '; '[': ConvStr[i] := ' '; ']': ConvStr[i] := ' '; end; end; PrepareTheString := i; end; function NGramMatch(TextPara, SearchStr: string; SearchStrLen, NGramLen: Integer; var MaxMatch: Integer): Integer; var NGram: string[8]; NGramCount: Integer; i, Count: Integer; begin NGramCount := SearchStrLen - NGramLen + 1; Count := 0; MaxMatch := 0; for i := 1 to NGramCount do begin NGram := Copy(SearchStr, i, NGramLen); if (NGram[NGramLen - 1] = ' ') and (NGram[1] < > ' ') then Inc(i, NGramLen - 3) {will be increased in the loop} else begin Inc(MaxMatch, NGramLen); if Pos(NGram, TextPara) > 0 then Inc(Count); end; end; NGramMatch := Count * NGramLen; end; procedure FuzzyMatching(SearchStr: string; Treshold: Integer; var InFile: Text); var TextPara: string; TextBuffer: string; TextLen: Integer; SearchStrLen: Integer; NGram1Len: Integer; NGram2Len: Integer; MatchCount1: Integer; MatchCount2: Integer; MaxMatch1: Integer; MaxMatch2: Integer; Similarity: Real; BestSim: Real; begin BestSim := 0.0; SearchStrLen := PrepareTheString(SearchStr, SearchStr); NGram1Len := 3; if SearchStrLen < 7 then NGram2Len := 2 else NGram2Len := 5; while not Eof(InFile) do begin Readln(InFile, TextBuffer); TextLen := PrepareTheString(TextBuffer, TextPara) + 1; TextPara := Concat(' ', TextPara); if TextLen < MaxParLen - 2 then begin MatchCount1 := NGramMatch(TextPara, SearchStr, SearchStrLen, NGram1Len, MaxMatch1); MatchCount2 := NGramMatch(TextPara, SearchStr, SearchStrLen, NGram2Len, MaxMatch2); Similarity := 100.0 * (MatchCount1 + MatchCount2) / (MaxMatch1 + MaxMatch2); if Similarity > BestSim then BestSim := Similarity; if Similarity >= Treshold then begin Writeln; Writeln('[', Similarity, '] ', TextBuffer); end; end; else Writeln('Paragraph too long'); end; if BestSim < Treshold then Writeln('No match; Best Match was ', BestSim); end; begin Writeln; Writeln('+------------------------------------------+'); Writeln('| Fuzzy Search in Information Retrieval |'); Writeln('| (C) 1997 Reinhard Rapp |'); Writeln('+------------------------------------------+'); Writeln; Write('Name of file to search in: '); Readln(Filename); Write('Search string: '); Readln(InputStr); SearchStr := Concat(' ', InputStr, ' '); Write('Minimum hit quality in % : '); Readln(Treshold); if (Treshold > 0) and (Treshold <= 100) and (SearchStr < > '') and (Filename < > '') then begin Assign(InFile, Filename); Reset(InFile); FuzzyMatching(SearchStr, Treshold, InFile); Close(InFile); end; Writeln; Writeln('Bye!'); end. Solve 3: unit FuzzyMatch; {This unit provides a basic 'fuzzy match' index on how alike two strings are The result is of type 'single': near 0 - poor match near 1 - close match The intention is that HowAlike(s1,s2)=HowAlike(s2,s1) The Function is not case sensitive} interface uses sysutils; function HowAlike(s1, s2: string): single; implementation function instr(start: integer; ToSearch, ToFind: string): integer; begin //This is a quick implementation of the VB InStr, since Pos just doesn't do what is needed!! //NB - case sensitive!! if start > 1 then Delete(ToSearch, 1, start - 1); result := pos(ToFind, ToSearch); if (result > 0) and (start > 1) then inc(result, start); end; function HowAlike(s1, s2: string): single; var l1, l2, pass, position, size, foundpos, maxscore: integer; score, scored, string1pos, string2pos, bestmatchpos: single; swapstring, searchblock: string; begin s1 := Uppercase(trim(s1)); s2 := Uppercase(trim(s2)); score := 0; maxscore := 0; scored := 0; //deal with zero length strings... if (s1 = '') and (s2 = '') then begin result := 1; exit; end else if (s1 = '') or (s2 = '') then begin result := 0; exit; end; //why perform any mathematics is the result is clear? if s1 = s2 then begin result := 1; exit; end; //make two passes, // with s1 and s2 each way round to ensure // consistent results for pass := 1 to 2 do begin l1 := length(s1); l2 := length(s2); for size := l1 downto 1 do begin for position := 1 to (l1 - size + 1) do begin //try to find implied block in the other string //Big blocks score much better than small blocks searchblock := copy(s1, position, size); foundpos := pos(searchblock, s2); if size = l1 then string1pos := 0.5 else string1pos := (position - 1) / (l1 - size); if foundpos > 0 then begin //the string is in somewhere in there // - find the 'closest' one. bestmatchpos := -100; //won't find anything that far away! repeat if size = l2 then string2pos := 0.5 else string2pos := (foundpos - 1) / (l2 - size); //If this closer than the previous best? if abs(string2pos - string1pos) < abs(bestmatchpos - string1pos) then bestmatchpos := string2pos; foundpos := instr(foundpos + 1, s2, searchblock); until foundpos = 0; //loop while foundpos>0.. //The closest position is now known: Score it! //Score as follows: (1-distance of best match) score := score + (1 - abs(string1pos - bestmatchpos)); end; //Keep track if the maximum possible score //BE CAREFUL IF CHANGING THIS FUNCTION!!! //maxscore:=maxscore+1; inc(maxscore); end; //for position.. end; //for size.. if pass = 1 then begin //swap the strings around swapstring := s1; s1 := s2; s2 := swapstring; end; //Each pass is weighted equally scored := scored + (0.5 * (score / maxscore)); score := 0; maxscore := 0; end; //for pass.. //HowAlike=score/maxscore result := scored; end; Solve 4: A Delphi implementation of the Levenshtein Distance Algorithm unit Levenshtein; {Objeto que calcula la distancia de Levenshtein entre 2 cadenas. Alvaro Jeria Madariaga. 04/10/2002 barbaro@hotpop.com} interface uses sysutils, Math; type Tdistance = class(TObject) private function minimum(a, b, c: Integer): Integer; public function LD(s, t: string): Integer; end; implementation function Tdistance.minimum(a, b, c: Integer): Integer; var mi: Integer; begin mi := a; if (b < mi) then mi := b; if (c < mi) then mi := c; Result := mi; end; function Tdistance.LD(s, t: string): Integer; var d: array of array of Integer; n, m, i, j, costo: Integer; s_i, t_j: char; begin n := Length(s); m := Length(t); if (n = 0) then begin Result := m; Exit; end; if m = 0 then begin Result := n; Exit; end; setlength(d, n + 1, m + 1); for i := 0 to n do d[i, 0] := i; for j := 0 to m do d[0, j] := j; for i := 1 to n do begin s_i := s[i]; for j := 1 to m do begin t_j := t[j]; if s_i = t_j then costo := 0 else costo := 1; d[i, j] := Minimum(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + costo); end; end; Result := d[n, m]; end; end. |