I created a function that iterates over a folder of Excel files and builds a list of all the column headers across all sheets. It works fine but is very slow. Do you have any ideas on how to improve it? Thanks!
import glob

import pandas as pd
# Folder that holds the Excel workbooks to scan.
path = r'C:\Users\John\Excel_folder'
# Every .xlsx file directly inside that folder (the pattern is not recursive).
all_files = glob.glob(f"{path}/*.xlsx")
def get_columns(file):
    """Append the header names of every sheet in one Excel workbook to ``col``.

    The workbook is opened a single time and the same ``pd.ExcelFile`` handle
    is reused for every sheet. Passing the file path to ``pd.read_excel`` for
    each sheet makes pandas open and load the workbook again on every call,
    which dominates the run time for workbooks with several sheets.

    Args:
        file: Path to the Excel workbook to inspect.

    Side effects:
        Extends the module-level list ``col`` with the column labels of each
        sheet, in sheet order. Duplicate labels are kept.
    """
    # The context manager closes the workbook handle once all sheets are read.
    with pd.ExcelFile(file) as workbook:
        for sheet in workbook.sheet_names:
            # nrows=0 requests no data rows, so only the header is returned.
            header = pd.read_excel(workbook, sheet_name=sheet, nrows=0)
            col.extend(header.columns)
# Shared accumulator that get_columns() appends header names to.
col = []
for excel_file in all_files:
    get_columns(excel_file)
# Bare expression: only has a visible effect in an interactive session
# (notebook cell / REPL), where it displays the collected headers.
col