I try to define a function calc to execute a for loop. This for loop appends the result calculated for a data frame with a few defined functions: weightedMean, sd_pooled, and summation. In the end, is is expected that the result will be outputted to output2. The code without defined function for the for loop works well. However, my issue is that after executing calc, the output2 is empty and there is no complain. Therefore, I am not able to troubleshooting accordingly. The complete code is as below:
import pandas as pd
import numpy as np
from dplython import X, sift, DplyFrame, mutate, select
from plydata import define, group_by, summarize
def weightedMean(data):
length = len(data['Var1'])
if length == 1:
mx = data['Var1']
return(length)
else:
mx = data['Var1'][0]
nx = data['Var3'][0]
for i in range(1,length):
my = data['Var1'][i]
ny = data['Var3'][i]
nx = nx + ny
mx=(mx*nx+my*ny)/(nx+ny)
return(mx)
def summation(data):
length = len(data['Var3'])
cx = data['Var3'][0]
for i in range(1,length):
cy = data['Var3'][i]
cx = cx + cy
return(cx)
def sd_c(x_m, x_s, x_n, y_m, y_s, y_n):
al = x_n+y_n
tmp_sd = al*((x_n-1)*(x_s*x_s)+(y_n-1)*(y_s*y_s))+y_n*x_n*(x_m-y_m)*(x_m-y_m)
var = tmp_sd/(al*(al-1))
std = np.sqrt(var)
return(std)
def sd_pooled(data):
length = len(data['Var1'])
if length == 1:
mx = data['Var1']
return(length)
else:
mx = data['Var1'][0]
sx = data['Var2'][0]
nx = data['Var3'][0]
for i in range(1,length):
my = data['Var1'][i]
sy = data['Var2'][i]
ny = data['Var3'][i]
sx = sd_c(mx, sx, nx, my, sy, ny)
nx = nx + ny
mx = (mx*nx + my*ny)/(nx + ny)
return(sx)
dat = pd.read_csv("input.txt",sep="\t")
dat = {
'Group': ['A','A','A','A','A','A','A','A','A','A'],
'Process': [3,3,3,3,3,3,841,841,841,841],
'Category': ['cat1','cat1','cat1','cat1','cat1','cat1','cat2','cat2','cat2','cat2'],'Type': ['type1','type1','type1','type1','type1','type1','type2','type2','type2','type2'],
'Var1': [86.84,103.39,109.00,107.30,123.09,111.98,87.62,87.40,88.53,85.84],
'Var2': [2.913,2.835,1.478,2.979,2.424,7.462,3.049,4.781,3.025,2.703],
'Var3': [0.01096,0.00564,0.00365,0.00631,0.00531,0.00332,0.01195,0.00930,0.00697,0.00697]
}
dat = pd.DataFrame(dat)
dat_name = dat.loc[:,'Type'].unique()
dat = DplyFrame(dat)
output = pd.DataFrame([])
def calc(dat_name, dat, output):
out = pd.DataFrame([])
for i in range(len(dat_name)):
df = (dat >>
sift(X.Type == dat_name[i]) >>
mutate(Var3 = X.Var3*3021) >>
sift(X.Var2 < 50))
out = out.append(df)
out_grouped = out.groupby(['Group', 'Type', 'Process', 'Category'])
init = []
mean = []
stdv = []
freq = []
kmer = []
for name, group in out_grouped:
group = pd.DataFrame(group).reset_index()
nm = name
wm = weightedMean(group)
sd = sd_pooled(group)
fq = summation(group)
init.append(nm)
mean.append(wm)
freq.append(fq)
stdv.append(sd)
init = pd.DataFrame(init)
mean = pd.DataFrame(mean)
freq = pd.DataFrame(freq)
stdv = pd.DataFrame(stdv)
init.rename(columns={0:'Group',1:'Type',2:'Process',3:'Category'}, inplace=True)
mean.rename(columns={0:'Var1'}, inplace=True)
stdv.rename(columns={0:'Var2'}, inplace=True)
freq.rename(columns={0:'Var3'}, inplace=True)
combined = pd.concat([init.reset_index(drop=True), mean, stdv, freq], axis=1)
output = output.append(combined)
output2 = calc(dat_name, dat, output)
This expected output is as below:
Group Type Process Category Var1 Var2 Var3
0 A type1 3 cat1 101.207332 13.997181 106.30899
1 A type2 841 cat2 87.431341 3.584393 106.30899
I wonder how I can get the calc work successfully in this case. Thank you.