I'm trying to loop through around 100k files in varying degrees of nested directories using vba and md5 hashing each of those files. Right now, the current state of the program takes around 1.5 to 2 hours to run completely. What can I do to increase the speed it is running at?
Option Explicit
Public Const OriginalFolder As String = "\\path\to\original\"
Public Const NewFolder As String = "\\path\to\the\copy\"
Public Sub runRec()
Dim StartTime As Double
StartTime = Timer
Dim originalFiles As Dictionary
Set originalFiles = New Dictionary
recurse OriginalFolder, OriginalFolder, originalFiles
Debug.Print "Runtime to first recurse: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
Debug.Print "Original File Count: " & originalFiles.count
Dim newFiles As Dictionary
Set newFiles = New Dictionary
recurse NewFolder, NewFolder, newFiles
Debug.Print "Runtime to second recurse: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
Debug.Print "Copy File Count: " & newFiles.count
'Remove matching md5 hashes
Dim originalKey As Variant
For Each originalKey In originalFiles.Keys
If newFiles.Exists(originalKey) Then
originalFiles.Remove originalKey
newFiles.Remove originalKey
End If
Next
Debug.Print "Runtime to md5: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
Dim originalColl As Collection
Set originalColl = New Collection
Dim copyColl As New Collection
Set copyColl = New Collection
'Remove any files with same filename and relative path
Dim copyKey As Variant
For Each originalKey In originalFiles.Keys
For Each copyKey In newFiles.Keys
If originalFiles.item(originalKey) = newFiles.item(copyKey) Then
originalColl.Add originalKey
copyColl.Add copyKey
End If
Next
Next
For Each originalKey In originalColl
originalFiles.Remove originalKey
Next
For Each copyKey In copyColl
newFiles.Remove copyKey
Next
Debug.Print "Runtime to relative: " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
'Report any files that have the same name but different location as an anomaly
prepDictionary originalFiles
prepDictionary newFiles
Set originalColl = New Collection
For Each originalKey In originalFiles.Keys
For Each copyKey In newFiles.Keys
If originalFiles.item(originalKey) = newFiles.item(copyKey) Then
originalColl.Add originalKey
End If
Next
Next
Debug.Print "Anomaly count: " & originalColl.count
Debug.Print "Runtime to Anomaly : " & Format((Timer - StartTime) / 86400, "hh:mm:ss")
'Report missing files
Debug.Print "Original File Count: " & originalFiles.count
Debug.Print "Copy File Count: " & newFiles.count
End Sub
Private Sub prepDictionary(ByRef dict As Dictionary)
Dim dictKey As Variant
Dim slashLocation As Long
For Each dictKey In dict.Keys
slashLocation = InStrRev(dict.item(dictKey), "\")
dict.item(dictKey) = Right(dict.item(dictKey), Len(dict.item(dictKey)) - slashLocation)
Next
End Sub
Private Sub recurse(ByVal basePath As String, ByVal folderPath As String, ByRef fileDict As Dictionary)
Dim FSO As FileSystemObject
Dim currentFolder As Folder
Dim subFolder As Folder
Dim folderFile As File
Set FSO = New FileSystemObject
Set currentFolder = FSO.GetFolder(folderPath)
Dim fileHash As String
For Each folderFile In currentFolder.Files
'Need to ignore files that dont have a size and lock files
If FileLen(folderFile.path) > 0 And Left(folderFile.Name, 2) <> "~$" And folderFile.Name <> "Thumbs.db" Then
fileHash = FileToMD5Hex(folderFile.path)
If Not fileDict.Exists(fileHash) Then
fileDict.Add fileHash, Right(folderFile.path, Len(folderFile.path) - Len(basePath))
End If
End If
Next
For Each subFolder In currentFolder.SubFolders
recurse basePath, subFolder.path, fileDict
Next
End Sub
Public Function FileToMD5Hex(sFileName As String) As String
Dim enc
Dim bytes
Dim outstr As String
Dim pos As Integer
Set enc = CreateObject("System.Security.Cryptography.MD5CryptoServiceProvider")
'Convert the string to a byte array and hash it
bytes = GetFileBytes(sFileName)
bytes = enc.ComputeHash_2((bytes))
'Convert the byte array to a hex string
For pos = 1 To LenB(bytes)
outstr = outstr & LCase(Right("0" & Hex(AscB(MidB(bytes, pos, 1))), 2))
Next
FileToMD5Hex = outstr
Set enc = Nothing
End Function
Private Function GetFileBytes(ByVal path As String) As Byte()
Dim lngFileNum As Long
Dim bytRtnVal() As Byte
lngFileNum = FreeFile
If LenB(Dir(path, vbNormal + vbReadOnly + vbHidden)) Then ''// Does file exist?
Open path For Binary Access Read As lngFileNum
ReDim bytRtnVal(LOF(lngFileNum) - 1&) As Byte
Get lngFileNum, , bytRtnVal
Close lngFileNum
Else
Err.Raise 53
End If
GetFileBytes = bytRtnVal
Erase bytRtnVal
End Function
Erraction is a red flag unless properly explained. \$\endgroup\$