Optimizations and Modifications
- Use the Aho-Corasick algorithm to find all keywords in a single pass over the string (much faster than checking each keyword against the string one at a time; see the sketch after this list)
- Since diseases and biomarkers are independent, find all diseases and all biomarkers in the string and take the product of the two result sets (i.e. biomarkers are found once rather than once per disease found)
- Each result carries the disease name and id plus the biomarker CA and Entry_name, as the OP requested
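For readers new to pyahocorasick, here is a minimal, self-contained sketch of these two ideas; the keys and the sample sentence are placeholders, not the real data:

from itertools import product
import ahocorasick as ahc

# Build one automaton per keyword family, then scan the sentence once per family
diseases = ahc.Automaton()
for name in ('angiocarcoma', 'shrimp allergy'):
    diseases.add_word(name, name)          # payload returned on every match
diseases.make_automaton()

biomarkers = ahc.Automaton()
for ca in ('3BHS1', 'HSD3B1'):
    biomarkers.add_word(ca, ca)
biomarkers.make_automaton()

sentence = 'very hard angiocarcoma diagnosed 3BHS1'
d_hits = [d for _, d in diseases.iter(sentence)]    # one pass for all diseases
b_hits = [b for _, b in biomarkers.iter(sentence)]  # one pass for all biomarkers
print(list(product(d_hits, b_hits)))
# Out: [('angiocarcoma', '3BHS1')]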
Result:
- ~33x speedup on the first nltk sentence, including setup (107 ms vs. 3.54 s)
- ~538,000x speedup on each subsequent nltk sentence, since the automatons are reused (6.57 µs vs. 3.54 s; 3.54 s / 6.57 µs ≈ 538,000)
New Code
File: main.py
##################################################################
# Imports keyword data from simulate_data module and processes
# on nltk sentence
##################################################################
if __name__ == "__main__":
    from process import Finder, format_result
    from simulate_data import df_disease, df_biomarker

    # Sentence to process
    nltk_sentence = 'very hard angiocarcoma diagnosed 3BHS1'

    # Set up search engine
    finder = Finder(df_disease, df_biomarker)

    # Process and loop through results
    for d, m in finder.process(nltk_sentence):
        format_result(d, m)
File: process.py
from itertools import product
import ahocorasick as ahc


def make_aho_automaton(keywords):
    A = ahc.Automaton()                  # initialize
    for (key, cat) in keywords:
        A.add_word(key, (cat, key))      # add keys and categories
    A.make_automaton()                   # generate automaton
    return A


class Finder():
    def __init__(self, df_disease, df_biomarker):
        ' Initialize automatons for diseases and biomarkers '
        # Disease and biomarker keywords
        # Pairs of (name, id) with name as key and id as category
        # note: leading underscore on variable names (e.g. _id) guards against shadowing builtins
        disease_keys = ((_name.lower(), _id) for _name, _id in zip(df_disease['name'], df_disease['id']))
        # Pairs of (CA, Entry_name) with each CA token as key and Entry_name as category
        biomarker_keys = (
            (_ca_entry, _entry_name)
            for _ca, _entry_name in zip(df_biomarker['CA'], df_biomarker['Entry_name'])
            for _ca_entry in _ca.split() if len(_ca_entry) >= 3
        )
        # Create Aho-Corasick automatons
        # Surround keywords with spaces so we match only on word boundaries
        self.disease_automaton = make_aho_automaton((f' {keyw} ', cat) for keyw, cat in disease_keys)
        self.biomarker_automaton = make_aho_automaton((f' {keyw} ', cat) for keyw, cat in biomarker_keys)

    def find_keywords(self, line, A):
        ' Finds keywords in line that exist in automaton A (as a generator) '
        # ensure line has a space at beginning and end, but strip it from each key
        return ((keyw.strip(), cat) for end_index, (cat, keyw) in A.iter(f' {line} '))

    def process(self, nltk_sentence):
        sentence = f' {nltk_sentence} '  # ensure sentence has a space at both ends (for keyword matching)
        for d, m in product(self.find_keywords(sentence, self.disease_automaton),
                            self.find_keywords(sentence, self.biomarker_automaton)):
            yield d, m


def format_result(d, m):
    ' Format results for printing '
    print(f'Match_found: Disease: name {d[0].title()} with id {d[1]}, Biomarker CA {m[0]} with Entry_name {m[1]}')
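As an aside, Aho-Corasick matches raw substrings; the space padding above is what provides the word-boundary behavior. An illustrative check using make_aho_automaton from above:

# ' 3BH ' matches only the standalone token, not the prefix of '3BHS1'
A = make_aho_automaton([(' 3BH ', 'TRGV2')])
print([keyw.strip() for _, (cat, keyw) in A.iter(' 3BHS1 3BH ')])
# Out: ['3BH']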
File: simulate_data.py
import string
from random import randint, choice
import pandas as pd
def random_word(min_length = 2, max_length = 5, upper = False):
' Generate random word '
if upper:
# Used for biomarker
letters = string.ascii_uppercase + string.digits
else:
# used by disease
letters = string.ascii_lowercase
return ''.join(choice(letters) for _ in range(randint(min_length, max_length)))
def random_sentence(min_length = 1, max_length = 3, upper = False):
'''Random sentence -- upper (True) generates letters and digits,
letters only for lower
'''
return ' '.join(random_word(upper = upper) for _ in range(randint(min_length, max_length)))
diseases = {"id":['00001', '00261'],
'name':['angiocarcoma', 'shrimp allergy']}
biomarkers = {'Entry_name':['TRGV2', 'TRGJ1'],
'CA':['3BHS1 HSD3B1 3BH HSDB3', '3BP1 SH3BP1 IF']}
N = 10000
for i in range(N):
diseases['id'].append(str(i+300).zfill(5))
diseases['name'].append(random_sentence(min_length = 1, max_length = 5, upper = False))
biomarkers['Entry_name'].append(random_word(4, 5, True))
biomarkers['CA'].append(random_sentence(2, 6, True))
df_disease = pd.DataFrame(diseases)
df_biomarker = pd.DataFrame(biomarkers)
Output
Example 1 -- use on a single nltk sentence

nltk_sentence = 'allergy very hard angiocarcoma diagnosed 3BH 3BHS1'

finder = Finder(df_disease, df_biomarker)   # Finder for disease and biomarker keys
for d, m in finder.process(nltk_sentence):  # apply to nltk_sentence
    format_result(d, m)

# Out:
# Match_found: Disease: name Angiocarcoma with id 00001, Biomarker CA 3BH with Entry_name TRGV2
# Match_found: Disease: name Angiocarcoma with id 00001, Biomarker CA 3BHS1 with Entry_name TRGV2
Example 2 -- looping over multiple nltk sentences

sentences = ("Very hard angiocarcoma diagnosed 3BHS1\n"        # Note: lowercase a in angiocarcoma (matches)
             "very hard Angiocarcoma diagnosed IF\n"           # Note: biomarker IF is too short (< 3 chars)
             "very hard angiocarcoma diagnosed 3BP0\n"         # Note: 3BP0 is not a known biomarker
             "Very hard Angiocarcoma diagnosed 3BP0 3BHS1\n"   # Note: capitalized A in Angiocarcoma (no match)
             "Fish allergy very hard angiocarcoma diagnosed 3BP0 3BHS1")

for sentence in sentences.split('\n'):
    print(sentence)
    for d, m in finder.process(sentence):
        format_result(d, m)
    print()
# Out:
# Very hard angiocarcoma diagnosed 3BHS1
# Match_found: Disease: name Angiocarcoma with id 00001, Biomarker CA 3BHS1 with Entry_name TRGV2
#
# very hard Angiocarcoma diagnosed IF
#
# very hard angiocarcoma diagnosed 3BP0
#
# Very hard Angiocarcoma diagnosed 3BP0 3BHS1
#
# Fish allergy very hard angiocarcoma diagnosed 3BP0 3BHS1
# Match_found: Disease: name Angiocarcoma with id 00001, Biomarker CA 3BHS1 with Entry_name TRGV2
Timing Test
Used a dataset with over 10K records (see simulate_data.py above).
Modifications to the OP's code (to make the timing comparison fair):
- Use generators rather than prints (printing would slow down the timing)
- Use more purposeful names than df and df1 (unrelated to timing)
Modified OP code (used to time the original):
import re


def disease_search(nltk_tokens_sen):
    for dis_index in range(len(df_disease)):
        disease_name = df_disease.at[dis_index, 'name']
        regex_for_dis = rf"\b{disease_name}\b"
        matches_for_dis = re.findall(regex_for_dis, nltk_tokens_sen, re.IGNORECASE | re.MULTILINE)
        if len(matches_for_dis) != 0:
            yield from disease_marker(nltk_tokens_sen, disease_name)


def disease_marker(nltk_tokens_sen, disease_name):
    for zz in range(len(df_biomarker)):
        biomarker_txt = df_biomarker.at[zz, 'CA']
        biomarker = biomarker_txt.split(" ")
        for tt in range(len(biomarker)):
            if len(biomarker[tt]) > 2:
                matches_for_marker = re.findall(rf"\b{re.escape(biomarker[tt])}\b", nltk_tokens_sen)
                if len(matches_for_marker) != 0:
                    yield disease_name, biomarker[tt]
Timing was performed with the Jupyter Notebook %timeit magic command.
Timing New Code
%timeit finder = Finder(df_disease, df_biomarker)  # Initialization
107 ms ± 4.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

%timeit list(finder.process(nltk_sentence))  # Per nltk sentence
6.57 µs ± 618 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

Total: ~107 ms for setup and ~6.57 µs per nltk sentence
Timing Original Code
%timeit list(disease_search(nltk_sentence))
3.54 s ± 446 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
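For reference, the same measurements can be made outside a notebook with the standard timeit module (a sketch assuming Finder, df_disease, df_biomarker, and nltk_sentence from the code above; absolute numbers will vary by machine):

import timeit

# One-time automaton construction (paid once per keyword set)
setup_s = timeit.timeit(lambda: Finder(df_disease, df_biomarker), number=10) / 10

# Per-sentence cost once the automatons exist
finder = Finder(df_disease, df_biomarker)
per_sentence_s = timeit.timeit(lambda: list(finder.process(nltk_sentence)),
                               number=100_000) / 100_000

print(f'setup: {setup_s * 1e3:.1f} ms, per sentence: {per_sentence_s * 1e6:.2f} µs')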