@@ -876,7 +876,7 @@ def lreshape(data, groups, dropna=True, label=None):
876876 return DataFrame (mdata , columns = id_cols + pivot_cols )
877877
878878
879- def wide_to_long (df , stubnames , i , j , sep = "" , numeric_suffix = True ):
879+ def wide_to_long (df , stubnames , i , j , sep = "" , suffix = '\d+' ):
880880 """
881881 Wide panel to long format. Less flexible but more user-friendly than melt.
882882
@@ -907,8 +907,10 @@ def wide_to_long(df, stubnames, i, j, sep="", numeric_suffix=True):
907907 in the wide format, to be stripped from the names in the long format.
908908 For example, if your column names are A-suffix1, A-suffix2, you
909909 can strip the hyphen by specifying `sep`='-'
910- numeric_suffix : bool, default True
911- Whether the stub suffix is assumed to be numeric or not.
910+ suffix : str, default '\d+'
911+ A regular expression capturing the wanted suffixes. '\d+' captures
912+ numeric suffixes. Suffixes with no numbers can be specified with the
913+ negated character class '\D+'.
912914
913915 Returns
914916 -------
@@ -1045,15 +1047,24 @@ def wide_to_long(df, stubnames, i, j, sep="", numeric_suffix=True):
10451047 `pandas.melt` under the hood, but is hard-coded to "do the right thing"
10461048 in a typical case.
10471049 """
1050+ def get_var_names (df , stub , sep , suffix ):
1051+ # The first part of this regex is needed to avoid multiple "greedy"
1052+ # matches with stubs that have overlapping substrings. For example
1053+ # A2011, A2012 are separate from AA2011, AA2012. And BBone, BBtwo is
1054+ # different from Bone, Btwo, and BBBrating
1055+ # The last part lets us disambiguate suffixes. For example, with
1056+ # stubname A: (A2011, A2012) would be captured while Arating would
1057+ # be ignored by the numeric class \d+
1058+ regex = "^{0}(?!{1}){2}{3}" .format (
1059+ re .escape (stub ), re .escape (stub [- 1 ]), re .escape (sep ), suffix )
10481060
1049- def get_var_names (df , regex ):
10501061 return df .filter (regex = regex ).columns .tolist ()
10511062
10521063 def melt_stub (df , stub , i , j , value_vars , sep ):
10531064 newdf = melt (df , id_vars = i , value_vars = value_vars ,
10541065 value_name = stub .rstrip (sep ), var_name = j )
10551066 newdf [j ] = Categorical (newdf [j ])
1056- newdf [j ] = newdf [j ].str .replace (re .escape (stub ), "" )
1067+ newdf [j ] = newdf [j ].str .replace (re .escape (stub + sep ), "" )
10571068
10581069 return newdf .set_index (i + [j ])
10591070
@@ -1066,33 +1077,14 @@ def melt_stub(df, stub, i, j, value_vars, sep):
10661077 if not isinstance (i , list ):
10671078 i = [i ]
10681079
1069- stubs = list (map (lambda x : x + sep , stubnames ))
1070-
1071- # This regex is needed to avoid multiple "greedy" matches with stubs
1072- # that have overlapping substrings
1073- # For example A2011, A2012 are separate from AA2011, AA2012
1074- # And BBone, BBtwo is different from Bone, Btwo, and BBBrating
1075- value_vars = list (map (lambda x : get_var_names (
1076- df , "^{0}(?!{1})" .format (re .escape (x ), re .escape (x [- 1 ]))), stubs ))
1080+ value_vars = list (map (lambda stub :
1081+ get_var_names (df , stub , sep , suffix ), stubnames ))
10771082
10781083 value_vars_flattened = [e for sublist in value_vars for e in sublist ]
10791084 id_vars = list (set (df .columns .tolist ()).difference (value_vars_flattened ))
10801085
1081- # If we know the stub end type is a number we can disambiguate potential
1082- # misclassified value_vars, for ex, with stubname A: A2011, A2012 and
1083- # Arating would all be found as value_vars. If the suffix is numeric we
1084- # know the last one should be an id_var. (Note the converse disambiguation
1085- # is not possible)
1086- if numeric_suffix :
1087- for s , v in zip (stubs , value_vars ):
1088- for vname in v [:]:
1089- end = vname .replace (s , "" )
1090- if not end .isdigit ():
1091- v .remove (vname )
1092- id_vars .append (vname )
1093-
10941086 melted = []
1095- for s , v in zip (stubs , value_vars ):
1087+ for s , v in zip (stubnames , value_vars ):
10961088 melted .append (melt_stub (df , s , i , j , v , sep ))
10971089 melted = melted [0 ].join (melted [1 :], how = 'outer' )
10981090
0 commit comments