I have a Python class that performs calculations. It reads data from a CSV file, runs the calculations, and writes the results back to a CSV file. Unfortunately, with a large population (20K rows and 50 columns) it takes about an hour. Is it possible to speed this up, or should I rewrite the script to work with NumPy arrays instead of pandas Series?

Here is an example of my code (the beginning and the end; in total there are about 500 lines). The methods in the class are very simple and basic:
import pandas as pd


class Asset:
    def __init__(self, filename):
        # read CSV data and keep only the columns used in the calculations
        self.data = pd.read_csv(filename)
        self.filename = filename
        self.data = self.data[['month', 'loan_number', 'Type', 'UPB',
                               'current_interest_rate', 'DQ', 'p']]
    def get_prn_bal(self, p, UPB):
        return p * UPB

    def get_int(self, prn_bal, current_interest_rate, month, dq):
        return prn_bal * (current_interest_rate / 12 * (month + dq + 1))

    def get_cf(self, prn_bal, current_interest_rate, month, dq):
        return prn_bal * (1 + current_interest_rate / 12 * (month + dq + 1))
    # ... many more simple methods like the above ...
    def get_data(self):
        self.data['Prn_Bal'] = self.data.apply(lambda x: self.get_prn_bal(x['p'], x['UPB']), axis=1)
        self.data['Interest'] = self.data.apply(lambda x: self.get_int(x['Prn_Bal'], x['current_interest_rate'], x['month'], x['DQ']), axis=1)
        self.data['CF'] = self.data.apply(lambda x: self.get_cf(x['Prn_Bal'], x['current_interest_rate'], x['month'], x['DQ']), axis=1)
        # to_csv opens the file in 'w' mode, which already truncates it,
        # so the explicit open/truncate/close is unnecessary
        self.data.to_csv(self.filename, header=True, index=False)
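For context, here is the kind of vectorized rewrite I have been considering. Since the methods only do element-wise arithmetic, the whole columns (pandas Series) can be passed in directly instead of calling `apply` with `axis=1` once per row. This is just a sketch assuming the three methods shown are representative of the rest; the `factor` variable is an illustrative name I introduced for the shared subexpression:

    def get_data(self):
        d = self.data
        # whole-column arithmetic: each line operates on an entire Series at once,
        # instead of invoking a Python lambda for every row
        d['Prn_Bal'] = d['p'] * d['UPB']
        factor = d['current_interest_rate'] / 12 * (d['month'] + d['DQ'] + 1)
        d['Interest'] = d['Prn_Bal'] * factor
        d['CF'] = d['Prn_Bal'] * (1 + factor)
        d.to_csv(self.filename, header=True, index=False)

The existing methods could even be reused unchanged, e.g. `d['Prn_Bal'] = self.get_prn_bal(d['p'], d['UPB'])`, because `*`, `+` and `/` already broadcast over Series. Would this be the right direction, or is dropping down to raw NumPy arrays (e.g. via `d['p'].to_numpy()`) worth the extra step?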