I wrote a piece of code that parses around a hundred XML files and builds a single dataframe. The code works, but it is slow: a full run takes a little under an hour. I suspect it could be improved by only creating dataframe objects at the end of the loop, or by avoiding the triple-nested loop altogether, but as a novice this is the only way I managed to get it working.
My code looks like this:
from bs4 import BeautifulSoup
import pandas as pd
import lxml
import json
import os

os.chdir(r"path_to_output_file/output_file")
f_list = os.listdir()
df_list = []
output_files = []

# keeping only the XML files whose names contain "calc_output"
for calc_output in f_list:
    if "calc_output" in calc_output and calc_output.endswith(".xml"):
        output_files.append(calc_output)

for calc_output in output_files:
    with open(calc_output, "r") as datas:
        print(f"reading file {calc_output} ...")
        doc = BeautifulSoup(datas.read(), "lxml")

        rows = []
        timestamps = doc.time.find_all("timestamp")
        for timestamp in timestamps:  # parsing through every timestamp element
            # reading the timestamp attributes
            time = timestamp.get("time")
            temperature = timestamp.get("temperature")
            zone_id = doc.zone.get("zone_id")
            time_id = timestamp.get("time_id")
            rows.append({"time": time, "temperature": temperature,
                         "time_id": time_id, "zone_id": zone_id})
        # temporary dataframe, merged below with the surface information
        df1 = pd.DataFrame(rows)

        rows = []
        surfacedatas = doc.surfacehistory.find_all("surfacedata")
        for surfacedata in surfacedatas:
            # parsing through every surfacedata element
            row = {"time-begin": surfacedata.get("time-begin"),
                   "time-end": surfacedata.get("time-end")}
            things = surfacedata.find_all("thing", recursive=False)
            # parsing through every thing in each surfacedata
            for thing in things:
                identity = id2name(thing.get("identity"))
                row.update({"identity": identity})
                locations = thing.find_all("location", recursive=False)
                for location in locations:
                    # parsing through every location of every thing
                    l_identity = location.get("l_identity")
                    surface = location.getText()
                    row.update({"l_identity": l_identity, "surface": surface})
                    rows.append(row.copy())
        df2 = pd.DataFrame(rows)  # second dataframe containing the surface information

        # merging the two dataframes for this file ...
        df = pd.merge(df1, df2, left_on="time_id", right_on="time-begin")
        # ... then appending the result to a list
        df_list.append(df)

# final dataframe created by concatenating the dataframe from each output file
df = pd.concat(df_list)
df
An example of an XML file would be:
file 1
<file filename="stack_example_1" created="today">
<unit time="day" volume="cm3" surface="cm2"/>
<zone zone_id="10">
<time>
<timestamp time_id="1" time="0" temperature="100"/>
<timestamp time_id="2" time="10.00" temperature="200"/>
</time>
<surfacehistory type="calculation">
<surfacedata time-begin="1" time-end="2">
<thing identity="1">
<location l_identity="2"> 1.256</location>
<location l_identity="45"> 2.3</location>
</thing>
<thing identity="3">
<location l_identity="2"> 1.6</location>
<location l_identity="5"> 2.5</location>
<location l_identity="78"> 3.2</location>
</thing>
</surfacedata>
<surfacedata time-begin="2" time-end="3">
<thing identity="1">
<location l_identity="17"> 2.4</location>
</thing>
</surfacedata>
</surfacehistory>
</zone>
</file>
file 2
<file filename="stack_example_2" created="today">
<unit time="day" volume="cm3" surface="cm2"/>
<zone zone_id="11">
<time>
<timestamp time_id="1" time="0" temperature="100"/>
<timestamp time_id="2" time="10.00" temperature="200"/>
</time>
<surfacehistory type="calculation">
<surfacedata time-begin="1" time-end="2">
<thing identity="1">
<location l-identity="2"> 1.6</location>
<location l-identity="45"> 2.6</location>
</thing>
<thing identity="3">
<location l-identity="2"> 1.4</location>
<location l-identity="8"> 2.7</location>
</thing>
</surfacedata>
<surfacedata time-begin="2" time-end="3">
<thing identity="1">
<location l-identity="9"> 2.8</location>
<location l-identity="17"> 1.2</location>
</thing>
</surfacedata>
</surfacehistory>
</zone>
</file>
The output of this code using file 1 and file 2 would be:
zone_id  time  time_id  temperature  time-begin  time-end  identity  l_identity  surface
10       0     1        100          1           2         1         2           1.256
10       0     1        100          1           2         1         45          2.3
10       0     1        100          1           2         3         2           1.6
10       0     1        100          1           2         3         5           2.5
10       0     1        100          1           2         3         78          3.2
10       10    2        200          2           3         1         17          2.4
11       0     1        100          1           2         1         2           1.6
11       0     1        100          1           2         1         45          2.6
11       0     1        100          1           2         3         2           1.4
11       0     1        100          1           2         3         8           2.7
11       10    2        200          2           3         1         9           2.8
11       10    2        200          2           3         1         17          1.2
Here is the output obtained after running cProfile:
Ordered by: internal time
List reduced from 6281 to 20 due to restriction <20>
ncalls tottime percall cumtime percall filename:lineno(function)
214204 95.337 0.000 95.340 0.000 C:\Users\anon\Anaconda3\lib\json\decoder.py:343(raw_decode)
214389 20.685 0.000 21.386 0.000 {built-in method io.open}
214288 17.945 0.000 17.945 0.000 {built-in method _codecs.charmap_decode}
1 16.745 16.745 336.360 336.360 .\anon_programm.py:7(<module>)
10 15.378 1.538 132.814 13.281 C:\Users\anon\Anaconda3\lib\site-packages\bs4\builder\_lxml.py:330(feed)
10277616 12.975 0.000 44.266 0.000 C:\Users\anon\Anaconda3\lib\site-packages\bs4\__init__.py:555(endData)
214228 12.504 0.000 30.575 0.000 {method 'read' of '_io.TextIOWrapper' objects}
3425862 11.257 0.000 75.608 0.000 C:\Users\anon\Anaconda3\lib\site-packages\bs4\builder\_lxml.py:223(start)
6851244 10.806 0.000 19.427 0.000 C:\Users\anon\Anaconda3\lib\site-packages\bs4\__init__.py:589(object_was_parsed)
17128360 8.580 0.000 8.580 0.000 C:\Users\anon\Anaconda3\lib\site-packages\bs4\element.py:158(setup)
3425862 8.389 0.000 8.694 0.000 C:\Users\anon\Anaconda3\lib\site-packages\bs4\__init__.py:527(popTag)
5961888 7.170 0.000 7.170 0.000 {method 'keys' of 'dict' objects}
3425872 7.072 0.000 23.054 0.000 C:\Users\anon\Anaconda3\lib\site-packages\bs4\element.py:1152(__init__)
214200 5.978 0.000 146.468 0.001 .\anon_programm.py:18(id2name)
3425862 5.913 0.000 61.118 0.000 C:\Users\anon\Anaconda3\lib\site-packages\bs4\__init__.py:691(handle_starttag)
3425002 4.482 0.000 12.571 0.000 C:\Users\anon\Anaconda3\lib\site-packages\bs4\builder\__init__.py:285(_replace_cdata_list_attribute_values)
3425862 4.326 0.000 37.251 0.000 C:\Users\anon\Anaconda3\lib\site-packages\bs4\builder\_lxml.py:278(end)
3425862 4.244 0.000 13.552 0.000 C:\Users\anon\Anaconda3\lib\site-packages\bs4\__init__.py:657(_popToTag)
2751774 4.240 0.000 6.154 0.000 C:\Users\anon\Anaconda3\lib\site-packages\bs4\element.py:808(<genexpr>)
6851244 3.869 0.000 8.629 0.000 C:\Users\anon\Anaconda3\lib\site-packages\bs4\element.py:932(__new__)
Here is the function that is called a lot in the loop:
import functools

@functools.lru_cache(maxsize=1000)
def id2name(id):
    name_Dict = json.loads(open(r"path_to_JSON_file\file.json", "r").read())
    name = ""
    if id.isnumeric():
        partial_id = id[:-1]
        if partial_id not in name_Dict.keys():
            return id
        if id[-1] == "0":
            return name_Dict[partial_id]
        else:
            return name_Dict[partial_id] + "x" + id[-1]
    else:
        return ""
Beyond the JSON decoding, the other big cost in the profile is BeautifulSoup, which takes its own sweet time to try to fix invalid markup. While experimenting, it helps to profile on a subset of the files (output_files = output_files[:50], for instance) so each run stays short, and to look at tottime as well, not just cumtime, since cumtime contains time spent in functions called by that function.
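Since the files are machine-generated, and so presumably well-formed XML, one way to avoid BeautifulSoup's error-correcting layer entirely is to parse with lxml.etree and accumulate plain dicts, building a single dataframe at the very end, as the question itself suggests. This is an untested sketch under that assumption, not a drop-in replacement; parse_rows is a hypothetical helper name:

from lxml import etree
import pandas as pd

def parse_rows(path):
    # Strict XML parse: no markup fixing, far less per-node bookkeeping than bs4.
    root = etree.parse(path).getroot()
    zone_id = root.find("zone").get("zone_id")
    # Map time_id -> (time, temperature) so no per-file merge is needed;
    # this mirrors the merge on time_id == time-begin in the original code.
    times = {ts.get("time_id"): (ts.get("time"), ts.get("temperature"))
             for ts in root.iter("timestamp")}
    for sd in root.iter("surfacedata"):
        time, temperature = times[sd.get("time-begin")]
        for thing in sd.findall("thing"):
            identity = id2name(thing.get("identity"))
            for loc in thing.findall("location"):
                yield {"zone_id": zone_id, "time": time, "temperature": temperature,
                       "time_id": sd.get("time-begin"), "time-begin": sd.get("time-begin"),
                       "time-end": sd.get("time-end"), "identity": identity,
                       "l_identity": loc.get("l_identity"), "surface": loc.text}

rows = []
for calc_output in output_files:
    rows.extend(parse_rows(calc_output))
df = pd.DataFrame(rows)  # one dataframe, built once at the end

The same dict-accumulation idea works with the existing BeautifulSoup code too: collect every row from every file into one list and call pd.DataFrame once, instead of creating and merging two dataframes per file.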