Define function for "for" loop to output data frame in Python

Question

I try to define a function calc to execute a for loop. This for loop appends the result calculated for a data frame with a few defined functions: weightedMean, sd_pooled, and summation. In the end, is is expected that the result will be outputted to output2. The code without defined function for the for loop works well. However, my issue is that after executing calc, the output2 is empty and there is no complain. Therefore, I am not able to troubleshooting accordingly. The complete code is as below:

import pandas as pd
import numpy as np
from dplython import X, sift, DplyFrame, mutate, select
from plydata import define, group_by, summarize

def weightedMean(data):
        length = len(data['Var1'])
        if length == 1:
                mx = data['Var1']
                return(length)
        else:
                mx = data['Var1'][0]
                nx = data['Var3'][0]
                for i in range(1,length):
                        my = data['Var1'][i]
                        ny = data['Var3'][i]
                        nx = nx + ny
                        mx=(mx*nx+my*ny)/(nx+ny)
                return(mx)

def summation(data):
        length = len(data['Var3'])
        cx = data['Var3'][0]
        for i in range(1,length):
                cy = data['Var3'][i]
                cx = cx + cy
        return(cx)

def sd_c(x_m, x_s, x_n, y_m, y_s, y_n):
        al = x_n+y_n
        tmp_sd = al*((x_n-1)*(x_s*x_s)+(y_n-1)*(y_s*y_s))+y_n*x_n*(x_m-y_m)*(x_m-y_m)
        var = tmp_sd/(al*(al-1))
        std = np.sqrt(var)
        return(std)

def sd_pooled(data):
        length = len(data['Var1'])
        if length == 1:
                mx = data['Var1']
                return(length)
        else:
                mx = data['Var1'][0]
                sx = data['Var2'][0]
                nx = data['Var3'][0]
                for i in range(1,length):
                        my = data['Var1'][i]
                        sy = data['Var2'][i]
                        ny = data['Var3'][i]
                        sx = sd_c(mx, sx, nx, my, sy, ny)
                        nx = nx + ny
                        mx = (mx*nx + my*ny)/(nx + ny)
                return(sx)

dat = pd.read_csv("input.txt",sep="\t")

dat = {
'Group': ['A','A','A','A','A','A','A','A','A','A'],
'Process': [3,3,3,3,3,3,841,841,841,841],
'Category': ['cat1','cat1','cat1','cat1','cat1','cat1','cat2','cat2','cat2','cat2'],'Type': ['type1','type1','type1','type1','type1','type1','type2','type2','type2','type2'],
'Var1': [86.84,103.39,109.00,107.30,123.09,111.98,87.62,87.40,88.53,85.84],
'Var2': [2.913,2.835,1.478,2.979,2.424,7.462,3.049,4.781,3.025,2.703],
'Var3': [0.01096,0.00564,0.00365,0.00631,0.00531,0.00332,0.01195,0.00930,0.00697,0.00697]
}
dat = pd.DataFrame(dat)

dat_name = dat.loc[:,'Type'].unique()
dat = DplyFrame(dat)
output = pd.DataFrame([])
def calc(dat_name, dat, output):
        out = pd.DataFrame([])
        for i in range(len(dat_name)):
                df = (dat >>
                        sift(X.Type == dat_name[i]) >>
                        mutate(Var3 = X.Var3*3021) >>
                        sift(X.Var2 < 50))
                out = out.append(df)
                out_grouped = out.groupby(['Group', 'Type', 'Process', 'Category'])
                init = []
                mean = []
                stdv = []
                freq = []
                kmer = []
                for name, group in out_grouped:
                        group = pd.DataFrame(group).reset_index()
                        nm = name
                        wm = weightedMean(group)
                        sd = sd_pooled(group)
                        fq = summation(group)
                        init.append(nm)
                        mean.append(wm)
                        freq.append(fq)
                        stdv.append(sd)
                init = pd.DataFrame(init)
                mean = pd.DataFrame(mean)
                freq = pd.DataFrame(freq)
                stdv = pd.DataFrame(stdv)
                init.rename(columns={0:'Group',1:'Type',2:'Process',3:'Category'}, inplace=True)
                mean.rename(columns={0:'Var1'}, inplace=True)
                stdv.rename(columns={0:'Var2'}, inplace=True)
                freq.rename(columns={0:'Var3'}, inplace=True)
                combined = pd.concat([init.reset_index(drop=True), mean, stdv, freq], axis=1)
                output = output.append(combined)

output2 = calc(dat_name, dat, output)

This expected output is as below:

  Group   Type  Process Category        Var1       Var2       Var3
0     A  type1        3     cat1  101.207332  13.997181  106.30899
1     A  type2      841     cat2   87.431341   3.584393  106.30899

I wonder how I can get the calc work successfully in this case. Thank you.

Edel · Accepted Answer · 2019-07-24 08:22:19Z

1

I might be wrong, but I guess you should add return output at the end of your calc function.

answered Jul 24, 2019 at 8:22

Edel

925 bronze badges

Sign up to request clarification or add additional context in comments.

1 Comment

Edel Over a year ago

Just click the green check, right below the upvote button next to my answer.

Collectives™ on Stack Overflow

Define function for "for" loop to output data frame in Python

1 Answer 1

1 Comment

Your Answer

Hot Network Questions

Collectives™ on Stack Overflow

1 Answer 1

1 Comment

Your Answer

Sign up or log in

Post as a guest

Related