A much more efficient way is to use multiprocessing. The speed-up depends on how many cores you have; on my 8-core PC it is 10-15 times faster. It is quite easy to do: just change the first for loop into a map and use multiprocessing.Pool:
# `global` only has an effect when this snippet is pasted inside a function:
# it then publishes the worker under a module-level name, which the pool
# needs because functions are pickled by reference to that name.
global find_sub2


def find_sub2(sub):
    """Return the (sub, s) pairs for every other string s in mylist containing sub.

    Args:
        sub: The candidate substring.

    Returns:
        A list of ``(sub, s)`` tuples; empty when ``sub`` occurs in no other
        string of the module-level ``mylist``.
    """
    # The comprehension already yields [] when nothing matches, so no
    # separate empty-result branch is required.
    return [(sub, s) for s in mylist if sub in s and s != sub]


# Pool() without arguments sizes itself to the machine instead of a fixed 16
# workers, and the context manager releases the worker processes on exit
# (close() on its own never joins or terminates them).
# NOTE(review): in a script, run this under `if __name__ == '__main__':` on
# platforms whose start method is "spawn" - confirm for your target platform.
with multiprocessing.Pool() as pool:
    substring_superstring_list = pool.map(find_sub2, mylist)

# pool.map returns one list per input string; flatten them into one list.
flat_list = [item for sublist in substring_superstring_list for item in sublist]
I have compared the run times of several methods (based on yours and on those from other answers) using a list of 20000 random strings of random length 10-200:
['original', '31.936 seconds']
['traditional_loops', '64.088 seconds']
['two_for_loops', '32.337 seconds']
['sorting', '17.713 seconds']
['with_map', '31.832 seconds']
['map_with_multiprocessing', '3.08 seconds']
Here is the code:
from tqdm import tqdm
import multiprocessing
import random
import time
# Benchmark configuration.
ALLOWED_CHARS = 'abcdeghijklmn'  # alphabet the random strings are drawn from
NUMBER_OF_STRINGS = 20000        # how many strings the test list contains
MIN_STR_LENGTH = 10              # shortest generated string (inclusive)
MAX_STR_LENGTH = 100             # longest generated string (inclusive)


def random_string_generator(str_size, allowed_chars=ALLOWED_CHARS):
    """Build a random string of ``str_size`` characters.

    Args:
        str_size: Number of characters to generate.
        allowed_chars: Sequence each character is picked from with
            ``random.choice``; defaults to ``ALLOWED_CHARS``.

    Returns:
        The generated string.
    """
    picked = []
    for _ in range(str_size):
        picked.append(random.choice(allowed_chars))
    return ''.join(picked)
# Build the shared input list that every benchmarked function reads.
print('Creating random strings')
mylist = []
for _ in tqdm(range(NUMBER_OF_STRINGS)):
    length = random.randint(MIN_STR_LENGTH, MAX_STR_LENGTH)
    mylist.append(random_string_generator(length))
def original():
    """Reference implementation: per-string match lists, flattened at the end.

    For each string ``sub`` in the module-level ``mylist`` this collects the
    ``(sub, s)`` pairs where ``sub`` occurs inside a string ``s`` that is not
    equal to it. The driver uses this function's result as the expected
    answer for the other variants.

    Returns:
        A flat list of ``(substring, superstring)`` tuples.
    """
    substring_superstring_list = []
    for sub in tqdm(mylist):
        sub_pair = [(sub, s) for s in mylist if sub in s and s != sub]
        # Strings without any superstring contribute nothing.
        if sub_pair:
            substring_superstring_list.append(sub_pair)
    # Flatten the list of per-string lists into a single list of pairs.
    return [item for sublist in substring_superstring_list for item in sublist]
def traditional_loops():
    """Find (substring, superstring) pairs with two index-based loops.

    The index-loop form is deliberate: it is the style this benchmark
    variant is meant to time.

    Returns:
        A list of ``(mylist[i], mylist[j])`` tuples where the first string
        occurs inside the second and the two strings differ.
    """
    output = []
    for i in tqdm(range(len(mylist))):
        for j in range(len(mylist)):
            # Exclude by value rather than by position, like the other
            # variants do; comparing indices would report (x, x) whenever
            # the same string appears twice in mylist.
            if mylist[i] != mylist[j] and mylist[i] in mylist[j]:
                output.append((mylist[i], mylist[j]))
    return output
def two_for_loops():
    """Find (substring, superstring) pairs with two nested element loops.

    Compares every string of the module-level ``mylist`` against every
    string of the same list and appends matches directly to one flat list.

    Returns:
        A list of ``(x, y)`` tuples where ``x`` occurs inside ``y`` and the
        two strings are not equal.
    """
    flat_list = []
    for x in tqdm(mylist):
        for y in mylist:
            if x in y and x != y:
                flat_list.append((x, y))
    return flat_list
def with_map():
    """Find (substring, superstring) pairs by mapping a helper over mylist.

    Returns:
        A flat list of ``(substring, superstring)`` tuples.
    """
    def find_sub(sub):
        # Pairs of sub with every different string of mylist that contains it.
        sub_pair = [(sub, s) for s in mylist if sub in s and s != sub]
        # Both branches yield an empty list when nothing matched.
        if sub_pair:
            return sub_pair
        else:
            return []
    # map() is lazy; the helper runs while the comprehension below iterates.
    substring_superstring_list = map(find_sub, tqdm(mylist))
    return [item for sublist in substring_superstring_list for item in sublist]
def map_with_multiprocessing():
    """Find (substring, superstring) pairs using a pool of worker processes.

    Each string of the module-level ``mylist`` is handed to a worker that
    compares it against the whole list.

    Returns:
        A flat list of ``(substring, superstring)`` tuples.
    """
    # Publish the nested worker under a module-level name: the pool pickles
    # functions by reference, which a purely local function cannot satisfy.
    # NOTE(review): this presumably relies on the "fork" start method, where
    # children inherit the already-defined name - confirm on spawn platforms.
    global find_sub2

    def find_sub2(sub):
        # Returns [] by itself when sub has no superstring in mylist.
        return [(sub, s) for s in mylist if sub in s and s != sub]

    # Pool() sizes itself to the machine instead of a fixed 16 workers; the
    # context manager releases the workers on exit (close() alone never
    # joins or terminates them).
    with multiprocessing.Pool() as pool:
        substring_superstring_list = pool.map(find_sub2, tqdm(mylist))
    # One list per input string comes back; flatten them into one list.
    return [item for sublist in substring_superstring_list for item in sublist]
def sorting():
    """Find (substring, superstring) pairs by scanning a length-sorted copy.

    Each string is only compared against the strings that follow it in the
    sorted copy, which roughly halves the number of containment checks.

    Returns:
        A list of ``(x, y)`` tuples where ``x`` occurs inside ``y`` and the
        two strings are not equal.
    """
    # A proper substring is strictly shorter than its superstring, so in
    # length order every possible superstring of x is located after x.
    # Lexicographic order gives no such guarantee ('b' sorts after 'ab'),
    # which made the previous version miss pairs.
    my_list = sorted(mylist, key=len)
    return [
        (x, y)
        for i, x in enumerate(tqdm(my_list), 1)
        for y in my_list[i:]
        # Equal strings are not a substring/superstring pair; the other
        # variants exclude them as well.
        if x != y and x in y
    ]
# Benchmark driver: run every variant once, time it, and compare its result
# against the first variant's result.
methods = [original, traditional_loops, two_for_loops, sorting, with_map, map_with_multiprocessing]
results = []
for fun in methods:
    print()
    print(f'Start testing {fun.__name__}')
    # perf_counter() is the clock intended for measuring short durations;
    # unlike time.time() it is unaffected by system clock adjustments.
    start = time.perf_counter()
    flat_list = fun()
    end = time.perf_counter()
    # Elapsed time is truncated (not rounded) to milliseconds.
    result = [fun.__name__, f'{int(1000 * (end - start)) / 1000.} seconds', flat_list]
    results.append(result)

# Pair order differs between variants, so results are compared as a set of
# pairs plus the total number of pairs.
solution = (set(results[0][2]), len(results[0][2]))
print()
for entry in results:
    print(f'{entry[:2]} Solution is correct? {solution == (set(entry[2]), len(entry[2]))}')
trie tree