Instead of mucking around with os.getcwd() and os.listdir(), I would recommend using the (Python 3) pathlib.Path object. It supports globbing (to get all files matching a pattern), joining paths with the / operator to build a new path, and even replacing the extension with a different one.
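For example (just a sketch, reusing the folder names from the script below):

from pathlib import Path

output_folder = Path("output_results")
for pdf_file in Path("folderForPdf").glob("*.pdf"):
    # Join paths with "/" and swap the ".pdf" extension for ".csv"
    print(output_folder / pdf_file.with_suffix(".csv").name)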
When reading the keywords, you can use a simple list comprehension. Or, even better, a set comprehension to get fast (constant-time) in lookups for free.
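The only difference is the kind of brackets; a sketch, assuming keywords.txt holds one keyword per line:

with open("keywords.txt") as f:
    keywords = [line.strip() for line in f]   # list comprehension
with open("keywords.txt") as f:
    keywords = {line.strip() for line in f}   # set comprehension: "word in keywords" is now O(1)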
line.strip() and line.strip("\n") are probably doing the same thing, unless you really want to preserve the spaces at the end of words.
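A quick illustration with a made-up line:

line = "  banana  \n"
line.strip("\n")   # '  banana  '  -> only the newline is removed
line.strip()       # 'banana'      -> all surrounding whitespace is removed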
The csv.writer has a writerows method that takes an iterable of rows, so you can avoid the explicit for loop.
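Since dict.items() already yields (keyword, count) pairs, you can pass it straight to writerows; a sketch with made-up data:

import csv

keyword_counts = {"banana": 3, "apple": 1}          # hypothetical result
with open("example.csv", "w", newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerow(['keyword', 'keyword_count'])   # header: a single row
    writer.writerows(keyword_counts.items())        # data: all rows at once

Putting all of this together, your script could look like this: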
from collections import Counter
import csv
from pathlib import Path
import re
import textract

def extract_text(file_name):
    return textract.process(file_name, method='tesseract', language='eng',
                            encoding='utf-8').decode('utf-8')

def extract_words(text):
    return re.findall(r'([a-zA-Z]+)', text)

def count_keywords(words, keywords):
    return Counter(word for word in words if word in keywords)

def read_keywords(file_name):
    with open(file_name) as f:
        return {line.strip() for line in f}

def save_keywords(file_name, keywords):
    with open(file_name, "w", newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(['keyword', 'keyword_count'])
        writer.writerows(keywords.items())

def main():
    output_folder = Path("output_results")
    output_folder.mkdir(exist_ok=True)  # make sure the output folder exists
    keywords = read_keywords('keywords.txt')
    for f in Path("folderForPdf").glob("*.pdf"):
        words = extract_words(extract_text(f))
        keyword_counts = count_keywords(words, keywords)
        # f.name drops the "folderForPdf" prefix so the CSV lands directly in output_results
        save_keywords(output_folder / f.with_suffix(".csv").name, keyword_counts)

if __name__ == "__main__":
    main()