Fast string file searching #42
function StringInFile(strFind, strFileName: string): boolean; const BUFSIZE = 8192; var fstm: TFileStream; numread: Longint; buffer: array [0..BUFSIZE-1] of char; szFind: array [0..255] of char; found: boolean; begin StrPCopy(szFind, strFind); found := False; fstm := TFileStream.Create(strFileName, fmOpenRead); repeat numread := fstrm.Read(Buffer, BUFSIZE); if BMFind(szFind, Buffer, numread) >= 0 then found := True else if numread = BUFSIZE then // more to scan fstm.Position := fstmPosition - (Length(strFind)-1); until found or (numread < BUFSIZE); fstm.Free; Result := found; end;
The reason for backing up fstm.Position by nearly the length of strFind is in case strFind crosses buffer boundaries.
The BMFind function used above is a Boyer-Moore search as shown below. This is the fastest string search known.
function BMFind(szSubStr, buf: PChar; iBufSize: integer): integer; { Returns -1 if substring not found, or zero-based index into buffer if substring found } var iSubStrLen: integer; skip: array [char] of integer; found: boolean; iMaxSubStrIdx: integer; iSubStrIdx: integer; iBufIdx: integer; iScanSubStr: integer; mismatch: boolean; iBufScanStart: integer; ch: char; begin { Initialisations } found := False; Result := -1; { Check if trivial scan for empty string } iSubStrLen := StrLen(szSubStr); if iSubStrLen = 0 then begin Result := 0; Exit end; iMaxSubStrIdx := iSubStrLen - 1; { Initialise the skip table } for ch := Low(skip) to High(skip) do skip[ch] := iSubStrLen; for iSubStrIdx := 0 to (iMaxSubStrIdx - 1) do skip[szSubStr[iSubStrIdx]] := iMaxSubStrIdx - iSubStrIdx; { Scan the buffer, starting comparisons at the end of the substring } iBufScanStart := iMaxSubStrIdx; while (not found) and (iBufScanStart < iBufSize) do begin iBufIdx := iBufScanStart; iScanSubStr := iMaxSubStrIdx; repeat mismatch := (szSubStr[iScanSubStr] <> buf[iBufIdx]); if not mismatch then if iScanSubStr > 0 then begin // more characters to scan Dec(iBufIdx); Dec(iScanSubStr) end else found := True; until mismatch or found; if found then Result := iBufIdx else iBufScanStart := iBufScanStart + skip[buf[iBufScanStart]]; end; end;
I have included a wholeword_only flag in the BMFind below. This confirms or rejects the found result, and will cause the loop to keep searching if match is rejected.
function BMFind(szSubStr, buf: PChar; iBufSize: integer; wholeword_only: boolean): integer; { Returns -1 if substring not found, or zero-based index into buffer if substring found } var iSubStrLen: integer; skip: array [char] of integer; found: boolean; iMaxSubStrIdx: integer; iSubStrIdx: integer; iBufIdx: integer; iScanSubStr: integer; mismatch: boolean; iBufScanStart: integer; ch: char; begin found := False; Result := -1; iSubStrLen := StrLen(szSubStr); if iSubStrLen = 0 then begin Result := 0; Exit end; iMaxSubStrIdx := iSubStrLen - 1; { Initialise the skip table } for ch := Low(skip) to High(skip) do skip[ch] := iSubStrLen; for iSubStrIdx := 0 to (iMaxSubStrIdx - 1) do skip[szSubStr[iSubStrIdx]] := iMaxSubStrIdx - iSubStrIdx; { Scan the buffer, starting comparisons at the end of the substring } iBufScanStart := iMaxSubStrIdx; while (not found) and (iBufScanStart < iBufSize) do begin iBufIdx := iBufScanStart; iScanSubStr := iMaxSubStrIdx; repeat mismatch := (szSubStr[iScanSubStr] <> buf[iBufIdx]); if not mismatch then if iScanSubStr > 0 then begin // more characters to scan Dec(iBufIdx); Dec(iScanSubStr) end else found := True; until mismatch or found; if found and wholeword_only then begin if (iBufIdx > 0) then found := not IsCharAlpha(buf[iBufIdx - 1]); if found then if iBufScanStart < (iBufSize - 1) then found := not IsCharAlpha(buf[iBufScanStart + 1]); end; if found then Result := iBufIdx else iBufScanStart := iBufScanStart + skip[buf[iBufScanStart]]; end; end;
Obviously you'll be tempted to increase BUFSIZE on the assumption that it will improve performance. My experience is that it does not, and that 8K is pretty optimum.
Author: | Unknown |
---|---|
Added: | 2007/06/11 |
Last updated: | 2007/06/11 |