My script reads various file types (TXT, CSV, JSON, DOCX, PDF, XLSX, HTML) and searches for specific words. It works fine for most file types, but for HTML files, it returns fewer results than expected. I use BeautifulSoup to extract text, but the search behavior seems inconsistent for HTML files.
The script uses fuzzy search to find the words like vipus and match them to virus and probably that is what doesn't work for HTML.
The content of each file is a list of words, as follows:
hello
example
virus
help
try
change
world
fullvirustry
ViRus
sp AcE
hy-ph-en
vipus
The code that reads from the file and searches for a string is the following one:
import os # To access the directories and files on the PC individually
import re # To search words easier
from collections import defaultdict # Won't raise an error if the key doesn't exist
from bs4 import BeautifulSoup # To work with Html files
from fuzzywuzzy import fuzz # To filter and match words with functions
# remove extra spaces,conv to lower.
def normalize_content(content):
content = re.sub(r"[^\w\s@]", "", content)
return re.sub(r"\s+", " ", content).strip().lower()
# handlers for different file types
def handle_txt(file_path):
try:
with open(file_path, "r", encoding="utf-8") as file:
return normalize_content(file.read())
except (UnicodeDecodeError, FileNotFoundError) as error:
print(f"Error reading TXT file {file_path}: {str(error)}")
return None
def handle_html(file_path):
try:
with open(file_path, "r", encoding="utf-8") as file:
all_words = BeautifulSoup(file, "html.parser")
return normalize_content(all_words.get_text())
except Exception as error:
print(f"Error reading HTML file {file_path}: {str(error)}")
return None
# File handlers connected to the file extensions
File_Handlers = {
".txt": handle_txt,
".html": handle_html,
}
# read a file using the correct handler and say if the ext is not supported.
def read_file(file_path):
_, ext = os.path.splitext(file_path)
handler = File_Handlers.get(ext)
if handler:
return handler(file_path)
else:
print(f"Unsupported file type: {ext}. Cannot read file: {file_path}.")
return None
# Counts search string occurrences and more...
def search_file(file_content, search_string, case_insensitive):
if not search_string.strip():
print("Error: The search string cannot be empty.")
return 0
if case_insensitive:
file_content = normalize_content(file_content)
search_string = search_string.lower()
return file_content.count(search_string)
# Perform a fuzzy search and tells how much is the match % is matching with the string
def fuzzy_search(file_content, search_string, matching_score):
matches = 0
# Finds all words in the text
words = re.findall(r"\b\w+\b", file_content)
for word in words:
score = fuzz.ratio(search_string.lower(), word.lower())
if score >= matching_score:
matches += 1
print(f"Match: '{word}' with score {score}%")
return matches
def search_in_directory(
directory_path,
search_string,
case_insensitive,
use_fuzzy=False,
fuzzy_matching_score=60,
):
found_files = defaultdict(int)
fuzzy_found_files = defaultdict(int)
unsupported_files = []
supported_file_count = 0
ignored_files = lambda file_name: file_name.startswith(".")
for root, dirs, files in os.walk(directory_path):
for file_name in files:
if ignored_files(file_name):
continue
file_path = os.path.join(root, file_name)
_, ext = os.path.splitext(file_path)
# Check if file is supported
if ext not in File_Handlers:
unsupported_files.append(file_path)
continue
# Process supported files
file_content = read_file(file_path)
if not file_content:
continue
supported_file_count += 1
# Normal search
normal_matches = search_file(file_content, search_string, case_insensitive)
if normal_matches > 0:
found_files[file_path] += normal_matches
# Fuzzy search (if enabled)
if use_fuzzy:
fuzzy_matches = fuzzy_search(
file_content, search_string, fuzzy_matching_score
)
if fuzzy_matches > 0:
fuzzy_found_files[file_path] += fuzzy_matches
# Combine results
all_matches = {**found_files, **fuzzy_found_files}
if all_matches:
print(f"\nFound the string '{search_string}' in the following files:")
for file, count in all_matches.items():
print(f"{file}: {count} occurrence(s)")
else:
print(f"\nThe string '{search_string}' was not found in any file.")
# Display the summary
print(f"\nTotal supported files processed: {supported_file_count}")
print(f"Unsupported files encountered: {len(unsupported_files)}")
if unsupported_files:
print("Unsupported files:")
for file in unsupported_files:
print(file)
print(f"Total matches found: {sum(all_matches.values())}")
# Main function to get the user inputs and more...
def main():
directory_path = input("Enter the directory path: ").strip()
if not os.path.isdir(directory_path):
print("Error: Directory does not exist.")
return
search_string = input("Enter the string to search for: ").strip()
if not search_string:
print("Error: The search string cannot be empty.")
return
case_insensitive = (
input("Case-insensitive search? (yes/no): ").lower().strip() == "yes"
)
fuzzy_search = (
input("Would you like to use fuzzy search? (yes/no): ").lower().strip() == "yes"
)
fuzzy_matching_score = 60
if fuzzy_search:
while True:
try:
fuzzy_matching_score = int(
input(
"Enter the fuzzy match minimum % (0-100) (default-60%): "
).strip()
)
if 0 <= fuzzy_matching_score <= 100:
break
else:
print("Error: Please enter a valid percentage between 0 and 100.")
except ValueError:
print("Error: Please enter a valid number between 0 and 100.")
search_in_directory(
directory_path,
search_string,
case_insensitive,
fuzzy_search,
fuzzy_matching_score,
)
if __name__ == "__main__":
main()
However, the result when running the code and searching for the word 'virus' differs from other formats to HTML (4 vs 3 occurrences).
Found the string 'virus' in the following files:
C:\Users\USER\Desktop\example\example.html: 3 occurrence(s)
C:\Users\USER\Desktop\example\example.txt: 4 occurrence(s)
I tried checking if the code related to loading and processing HTML can have something to do with the different results, but apparently it must be something else.
Can someone give me a hint on what the issue could be?
Thanks in advance!