Search for duplicate files on large drives (Views: 28)
Problem/Question/Abstract: I have a list of items that I will need to deal with (looking for duplicate files on a drive) that can be upto 2 to 5 million in size. So I will be populating this list and then searching it over and over and over. Normally when dealing with smaller lists like this I would simply use a TStringList and attach an object. However, this seems a little large for TStringList and the main reason, the searching with IndexOf, I don't think is reason enough to use it. So what I am looking for is a list of some sort (TObjectList ?) that is fast and good to deal with and can easily handle this size of entries. It would be really nice it there was a way to create multiple indexes into the in-memory list, as that would greatly speed up my processing of the information. The best solution would be an in-memory database of some sort (at least I think it would be), but my issue with in-memory databases is that of the String sizes for File Name and directory. If I use a regular String variable in an object, then the size can be variable. If I use a standard DB field, then the size in the ones I have seen are all static. So I have to define a huge field to handle all file names which wastes space on all other entries in the list. Any thoughts as the best container to handle this sort of thing? Answer: {$A+,B-,D-,E+,F-,G+,I-,L-,N-,O-,P-,Q-,R-,S-,T-,V-,X-,Y-} program FindDup; {$APPTYPE CONSOLE} // 2002-12-16 - Delphi 32 console app {Usage: FINDDUP [D:] This program finds all duplicate files on the specified drive. The algorithm uses a hash table where each hash value contains a linked list of files that match a given hash. Every file found is inserted into the hash table linked list unless it's already there and therefore a duplicate. All duplicates are added to a red-black tree, where each node contains the file ame and a pointer to a linked list of records containing size, date, and path information. This program can be compiled in either real or protected modes. Protected mode will allow it to search larger drives. Informal benchmarks on a 1.6 Gig drive with > 900 directories, Pentium 90, compiled for p-mode: TEST 16 sec. (simple do-nothing recursive dir searcher) FINDDUP 18 sec. - 16 = 2 sec. REP-DPMI 36 sec. - 16 = 20 sec. Subtracting the overhead of FindFirst/ FindNext, this program runs in about 2 seconds as compared to REP-DPMI's 20 seconds. Without ignoring the overhead, it's still twice as fast. Please note that this program is pure pascal.} {$DEFINE DRBOB} // Do not search hidden directories or find hidden files {.$DEFINE SAFE}// Checks memory allocations for out of memory conditions {.$DEFINE CLEANUP}// Frees allocated memory - slows things down a bit uses SysUtils; const DosDelimSet: set of Char = ['\', ':', #0]; MaxHash = 16381; {largest prime number < 16K} type PathStr = string; NameString = string[12]; St2 = string[2]; pPath = ^PathRecord; {we will only keep one copy of each unique directory path} PathRecord = record Next: pPath; Path: PathStr; end; pDataRec = ^DataRec; {detailed information unique to each dupliacte} DataRec = record Time: longint; Size: longint; Path: pPath; Next: pDataRec; end; link = ^RBTreeNode; RBTreeNode = record {Red/ black tree node} Key: NameString; {Name of file} red: boolean; l, r: link; DataP: pDataRec; {linked list of detail information} end; pFileRec = ^MyFileRec; MyFileRec = record {Hash table record. There will only be one record for each duplicate file name} Name: NameString; Time: longint; Size: longint; Path: pPath; Node: link; {let's quickly insert duplicate into tree} next: pFileRec; {next record in linked list} end; tHashTable = array[0..MAXHASH] of pFileRec; var Head, z: link; HashTable: ^tHashTable; PathHead, TempPathHead: pPath; OldName: NameString; procedure RBTreeInitialize; {Initialize red/ black tree} begin New(Z); {$IFDEF SAFE} if Z = nil then exit; {$ENDIF} z^.l := z; z^.r := z; z^.red := false; New(Head); {$IFDEF SAFE} if Head = nil then exit; {$ENDIF} Head^.r := z; Head^.l := z; Head^.Key := ''; Head^.Red := false; end; function Rotate(const Value: MyFileRec; y: link): link; var c, gc: link; begin if Value.Name < Y^.Key then c := y^.l else c := y^.r; if Value.Name < c^.Key then begin gc := c^.l; c^.l := gc^.r; gc^.r := c; end else begin gc := c^.r; c^.r := gc^.l; gc^.l := c; end; if Value.Name < Y^.Key then y^.l := gc else y^.r := gc; Rotate := gc; end; function Split(const Value: MyFileRec; gg, g, p, x: link): link; begin x^.red := true; x^.l^.red := false; x^.r^.red := false; if p^.red then begin g^.red := true; if (Value.Name < g^.Key) <> (Value.Name < p^.Key) then p := Rotate(Value, g); x := rotate(Value, gg); x^.red := false; end; Head^.r^.red := false; split := x; end; function RBTreeInsert(const Value: MyFileRec; x: link): link; {Insert file record into red/ black tree} var gg, g, p: link; begin p := x; g := x; repeat gg := g; g := p; p := x; if Value.Name < x^.Key then x := x^.l else x := x^.r; if x^.l^.red and x^.r^.red then x := split(Value, gg, g, p, x); until x = z; new(x); {$IFDEF SAFE} if x = nil then exit; {$ENDIF} x^.Key := Value.Name; New(x^.DataP); {$IFDEF SAFE} if x^.DataP = nil then exit; {$ENDIF} x^.DataP^.Next := nil; x^.DataP^.Time := Value.Time; x^.DataP^.Size := Value.Size; x^.DataP^.Path := Value.Path; x^.l := z; x^.r := z; if Value.Name < p^.Key then p^.l := x else p^.r := x; RbTreeInsert := x; x := Split(Value, gg, g, p, x); end; procedure Traverse(p: link); {Traverse red/ black tree, printing out results} var TempQ, q: pDataRec; begin if (p^.l <> z) and (p^.l <> nil) then Traverse(p^.l); if (p <> head) then begin if p^.Key <> OldName then begin OldName := p^.Key; writeln(OldName); end; q := p^.DataP; while q <> nil do begin with q^ do writeln(size: 10, ' ', FormatDateTime('yyyy-mm-dd hh:nn:ss', FileDateToDateTime(Time)), ' ', Path^.Path); {$IFDEF CLEANUP} TempQ := q; q := q^.Next; Dispose(TempQ); {$ELSE} q := q^.Next; {$ENDIF} end; writeln; end; if (p^.r <> z) and (p^.r <> nil) then Traverse(p^.r); {$IFDEF CLEANUP} Dispose(p); {$ENDIF} end; function AddBackSlash(const DirName: string): string; {Add a default backslash to a directory name} begin if DirName[Length(DirName)] in DosDelimSet then AddBackSlash := DirName else AddBackSlash := DirName + '\'; end; function Hash(const Key: NameString): word; var h: word; j: integer; Len: integer; begin Len := Length(Key); h := ord(Key[1]); for j := 2 to Len do begin h := ((h * 32) + Ord(Key[j])) mod MAXHASH; end; Hash := h; end; procedure Add(var SR: tSearchRec; DirP: pPath); {Add a new search record/ path to the hash table} var p, q, r: pFileRec; h: word; TempData: pDataRec; begin h := Hash(SR.Name); New(r); {$IFDEF SAFE} if r = nil then exit; {$ENDIF} r^.Name := SR.Name; r^.Time := SR.Time; r^.Size := SR.Size; r^.Path := DirP; r^.Next := nil; r^.Node := nil; p := HashTable^[H]; if p = nil then begin {Hash slot not used} HashTable^[h] := r; end else begin q := p; while (p <> nil) and (p^.Name < SR.Name) do begin q := p; p := p^.Next; end; if (p <> nil) and (p^.Name = SR.Name) then begin {Found duplicate file} if p^.Node = nil then begin {was not already in tree} p^.Node := RBTreeInsert(p^, Head); {save link so we don't have to search tree next time} end; New(TempData); {$IFDEF SAFE} if TempData = nil then exit; {$ENDIF} TempData^.Time := Sr.Time; TempData^.Size := Sr.Size; TempData^.Path := DirP; TempData^.Next := p^.Node^.DataP; {Add to linked list on tree node} p^.Node^.DataP := TempData; Dispose(r); {didn't need it after all} end else begin {Not a duplicate} if p = q then begin {Add at start of linked list} HashTable^[H] := r; r^.Next := P; end else begin {Insert into linked list} q^.Next := r; r^.Next := p; end; end; end; end; procedure Find(const Path: PathStr); {Recursive file/directory searcher} var Sr: tSearchRec; DirP: pPath; r: integer; begin New(DirP); {$IFDEF SAFE} if DirP = nil then exit; {$ENDIF} DirP^.Path := Path; DirP^.Next := PathHead; PathHead := DirP; r := FindFirst(AddBackSlash(Path) + '*.*', faAnyFile, Sr); while r = 0 do begin {$IFDEF DRBOB} {only do non-hidden directories} if ((Sr.Attr and faDirectory) <> 0) and ((Sr.Attr and faHidden) = 0) then begin {$ELSE} {do them all} if (Sr.Attr and Directory) <> 0 then begin {$ENDIF} if Sr.Name[1] <> '.' then Find(AddBackSlash(Path) + Sr.Name); r := 0; end else begin {$IFDEF DRBOB} if (Sr.Attr and faHidden) = 0 then {Only do non-hidden files} {$ENDIF} Add(Sr, DirP); end; r := FindNext(Sr); end; end; function HeapFunc(Size: Word): Integer; far; begin HeapFunc := 1; end; procedure Init; begin OldName := ''; PathHead := nil; {$IFDEF SAFE} HeapError := @HeapFunc; {$ENDIF} New(HashTable); {$IFDEF SAFE} if HashTable = nil then halt; {$ENDIF} FillChar(HashTable^, sizeof(HashTable^), 0); RBTreeInitialize; end; procedure Process; begin Find(ParamStr(1) + '\'); Traverse(Head); end; procedure Done; var i: integer; q, tempq: pFileRec; begin {$IFDEF CLEANUP} Dispose(Z); for i := 0 to MAXHASH - 1 do begin if HashTable^[i] <> nil then begin q := HashTable^[i]; while q <> nil do begin tempq := q^.next; Dispose(q); q := tempq; end; end; end; Dispose(HashTable); TempPathHead := PathHead; while PathHead <> nil do begin TempPathHead := PathHead^.Next; FreeMem(PathHead, Length(PathHead^.Path) + 5); PathHead := TempPathHead; end; {$ENDIF} end; begin Init; Process; Done; end. |