From d467a5c249c88512b840cc422e7d4a990fafc331 Mon Sep 17 00:00:00 2001 From: saurav Date: Wed, 21 Nov 2018 09:39:28 +0530 Subject: [PATCH 1/3] firt iter on errors --- doc/source/cookbook.rst | 459 +++++++++++++++++++++++----------------- 1 file changed, 269 insertions(+), 190 deletions(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 53468e755a722..b73358c67b1ab 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -9,18 +9,15 @@ import numpy as np from pandas.compat import StringIO - import random import os import itertools import functools import datetime - + import glob np.random.seed(123456) - pd.options.display.max_rows=15 + pd.options.display.max_rows = 15 - import matplotlib - # matplotlib.style.use('default') np.set_printoptions(precision=4, suppress=True) @@ -56,8 +53,9 @@ These are some neat pandas ``idioms`` .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df if-then... ********** @@ -66,36 +64,41 @@ An if-then on one column .. ipython:: python - df.loc[df.AAA >= 5,'BBB'] = -1; df + df.loc[df.AAA >= 5, 'BBB'] = -1 + df An if-then with assignment to 2 columns: .. ipython:: python - df.loc[df.AAA >= 5,['BBB','CCC']] = 555; df + df.loc[df.AAA >= 5, ['BBB', 'CCC']] = 555 + df Add another line with different logic, to do the -else .. ipython:: python - df.loc[df.AAA < 5,['BBB','CCC']] = 2000; df + df.loc[df.AAA < 5, ['BBB', 'CCC']] = 2000 + df Or use pandas where after you've set up a mask .. ipython:: python - df_mask = pd.DataFrame({'AAA' : [True] * 4, 'BBB' : [False] * 4,'CCC' : [True,False] * 2}) - df.where(df_mask,-1000) + df_mask = pd.DataFrame({'AAA': [True] * 4, 'BBB': [False] * 4, + 'CCC': [True, False] * 2}) + df.where(df_mask, -1000) `if-then-else using numpy's where() `__ .. 
ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df - - df['logic'] = np.where(df['AAA'] > 5,'high','low'); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df + df['logic'] = np.where(df['AAA'] > 5, 'high', 'low') + df Splitting ********* @@ -105,11 +108,14 @@ Splitting .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df - dflow = df[df.AAA <= 5]; dflow - dfhigh = df[df.AAA > 5]; dfhigh + dflow = df[df.AAA <= 5] + dflow + dfhigh = df[df.AAA > 5] + dfhigh Building Criteria ***************** @@ -119,45 +125,50 @@ Building Criteria .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df ...and (without assignment returns a Series) .. ipython:: python - newseries = df.loc[(df['BBB'] < 25) & (df['CCC'] >= -40), 'AAA']; newseries + newseries = df.loc[(df['BBB'] < 25) & (df['CCC'] >= -40), 'AAA'] + newseries ...or (without assignment returns a Series) .. ipython:: python - newseries = df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA']; newseries + newseries = df.loc[(df['BBB'] > 25) | (df['CCC'] >= -40), 'AAA'] + newseries ...or (with assignment modifies the DataFrame.) .. ipython:: python - df.loc[(df['BBB'] > 25) | (df['CCC'] >= 75), 'AAA'] = 0.1; df + df.loc[(df['BBB'] > 25) | (df['CCC'] >= 75), 'AAA'] = 0.1 + df `Select rows with data closest to certain value using argsort `__ .. 
ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df - + df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df aValue = 43.0 - df.loc[(df.CCC-aValue).abs().argsort()] + df.loc[(df.CCC - aValue).abs().argsort()] `Dynamically reduce a list of criteria using a binary operators `__ .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df Crit1 = df.AAA <= 5.5 Crit2 = df.BBB == 10.0 @@ -173,8 +184,8 @@ One could hard code: .. ipython:: python - CritList = [Crit1,Crit2,Crit3] - AllCrit = functools.reduce(lambda x,y: x & y, CritList) + CritList = [Crit1, Crit2, Crit3] + AllCrit = functools.reduce(lambda x, y: x & y, CritList) df[AllCrit] @@ -193,18 +204,22 @@ The :ref:`indexing ` docs. .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df - df[(df.AAA <= 6) & (df.index.isin([0,2,4]))] + df[(df.AAA <= 6) & (df.index.isin([0, 2, 4]))] `Use loc for label-oriented slicing and iloc positional slicing `__ .. ipython:: python - data = {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]} - df = pd.DataFrame(data=data,index=['foo','bar','boo','kar']); df + data = {'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]} + df = pd.DataFrame(data=data, index=['foo', 'bar', 'boo', 'kar']) + df There are 2 explicit slicing methods, with a third general case @@ -213,33 +228,35 @@ There are 2 explicit slicing methods, with a third general case 3. General (Either slicing style : depends on if the slice contains labels or positions) .. 
ipython:: python - df.iloc[0:3] #Positional + df.iloc[0:3] # Positional - df.loc['bar':'kar'] #Label + df.loc['bar': 'kar'] # Label # Generic df.iloc[0:3] - df.loc['bar':'kar'] + df.loc['bar': 'kar'] Ambiguity arises when an index consists of integers with a non-zero start or non-unit increment. .. ipython:: python - df2 = pd.DataFrame(data=data,index=[1,2,3,4]); #Note index starts at 1. + df2 = pd.DataFrame(data=data, index=[1, 2, 3, 4]) # Note index starts at 1. - df2.iloc[1:3] #Position-oriented + df2.iloc[1:3] # Position-oriented - df2.loc[1:3] #Label-oriented + df2.loc[1:3] # Label-oriented `Using inverse operator (~) to take the complement of a mask `__ .. ipython:: python - df = pd.DataFrame( - {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40], 'CCC' : [100,50,-30,-50]}); df + df = pd.DataFrame({'AAA': [4, 5, 6, 7], + 'BBB': [10, 20, 30, 40], + 'CCC': [100, 50, -30, -50]}) + df - df[~((df.AAA <= 6) & (df.index.isin([0,2,4])))] + df[~((df.AAA <= 6) & (df.index.isin([0, 2, 4])))] Panels ****** @@ -249,14 +266,18 @@ Panels .. ipython:: python - rng = pd.date_range('1/1/2013',periods=100,freq='D') + rng = pd.date_range('1/1/2013', periods=100, freq='D') data = np.random.randn(100, 4) - cols = ['A','B','C','D'] - df1, df2, df3 = pd.DataFrame(data, rng, cols), pd.DataFrame(data, rng, cols), pd.DataFrame(data, rng, cols) + cols = ['A', 'B', 'C', 'D'] + df1 = pd.DataFrame(data, rng, cols) + df2 = pd.DataFrame(data, rng, cols) + df3 = pd.DataFrame(data, rng, cols) - pf = pd.Panel({'df1':df1,'df2':df2,'df3':df3});pf + pf = pd.Panel({'df1': df1, 'df2': df2, 'df3': df3}) + pf - pf.loc[:,:,'F'] = pd.DataFrame(data, rng, cols);pf + pf.loc[:, :, 'F'] = pd.DataFrame(data, rng, cols) + pf `Mask a panel by using np.where and then reconstructing the panel with the new masked values `__ @@ -269,22 +290,26 @@ New Columns .. 
ipython:: python - df = pd.DataFrame( - {'AAA' : [1,2,1,3], 'BBB' : [1,1,2,2], 'CCC' : [2,1,3,1]}); df + df = pd.DataFrame({'AAA': [1, 2, 1, 3], + 'BBB': [1, 1, 2, 2], + 'CCC': [2, 1, 3, 1]}) + df - source_cols = df.columns # or some subset would work too. + source_cols = df.columns # or some subset would work too. new_cols = [str(x) + "_cat" for x in source_cols] - categories = {1 : 'Alpha', 2 : 'Beta', 3 : 'Charlie' } + categories = {1: 'Alpha', 2: 'Beta', 3: 'Charlie'} - df[new_cols] = df[source_cols].applymap(categories.get);df + df[new_cols] = df[source_cols].applymap(categories.get) + df `Keep other columns when using min() with groupby `__ .. ipython:: python - df = pd.DataFrame( - {'AAA' : [1,1,1,2,2,2,3,3], 'BBB' : [2,1,3,4,5,1,2,3]}); df + df = pd.DataFrame({'AAA': [1, 1, 1, 2, 2, 2, 3, 3], + 'BBB': [2, 1, 3, 4, 5, 1, 2, 3]}) + df Method 1 : idxmin() to get the index of the minimums @@ -312,20 +337,26 @@ The :ref:`multindexing ` docs. .. ipython:: python - df = pd.DataFrame({'row' : [0,1,2], - 'One_X' : [1.1,1.1,1.1], - 'One_Y' : [1.2,1.2,1.2], - 'Two_X' : [1.11,1.11,1.11], - 'Two_Y' : [1.22,1.22,1.22]}); df + df = pd.DataFrame({'row': [0, 1, 2], + 'One_X': [1.1, 1.1, 1.1], + 'One_Y': [1.2, 1.2, 1.2], + 'Two_X': [1.11, 1.11, 1.11], + 'Two_Y': [1.22, 1.22, 1.22]}) + df # As Labelled Index - df = df.set_index('row');df + df = df.set_index('row') + df # With Hierarchical Columns - df.columns = pd.MultiIndex.from_tuples([tuple(c.split('_')) for c in df.columns]);df + df.columns = pd.MultiIndex.from_tuples([tuple(c.split('_')) + for c in df.columns]) + df # Now stack & Reset - df = df.stack(0).reset_index(1);df + df = df.stack(0).reset_index(1) + df # And fix the labels (Notice the label 'level_1' got added automatically) - df.columns = ['Sample','All_X','All_Y'];df + df.columns = ['Sample', 'All_X', 'All_Y'] + df Arithmetic ********** @@ -335,9 +366,12 @@ Arithmetic .. 
ipython:: python - cols = pd.MultiIndex.from_tuples([ (x,y) for x in ['A','B','C'] for y in ['O','I']]) - df = pd.DataFrame(np.random.randn(2,6),index=['n','m'],columns=cols); df - df = df.div(df['C'],level=1); df + cols = pd.MultiIndex.from_tuples([(x, y) for x in ['A', 'B', 'C'] + for y in ['O', 'I']]) + df = pd.DataFrame(np.random.randn(2, 6), index=['n', 'm'], columns=cols) + df + df = df.div(df['C'], level=1) + df Slicing ******* @@ -347,44 +381,49 @@ Slicing .. ipython:: python - coords = [('AA','one'),('AA','six'),('BB','one'),('BB','two'),('BB','six')] + coords = [('AA', 'one'), ('AA', 'six'), ('BB', 'one'), ('BB', 'two'), + ('BB', 'six')] index = pd.MultiIndex.from_tuples(coords) - df = pd.DataFrame([11,22,33,44,55],index,['MyData']); df + df = pd.DataFrame([11, 22, 33, 44, 55], index, ['MyData']) + df To take the cross section of the 1st level and 1st axis the index: .. ipython:: python - df.xs('BB',level=0,axis=0) #Note : level and axis are optional, and default to zero + # Note : level and axis are optional, and default to zero + df.xs('BB', level=0, axis=0) ...and now the 2nd level of the 1st axis. .. ipython:: python - df.xs('six',level=1,axis=0) + df.xs('six', level=1, axis=0) `Slicing a MultiIndex with xs, method #2 `__ .. 
ipython:: python - index = list(itertools.product(['Ada','Quinn','Violet'],['Comp','Math','Sci'])) - headr = list(itertools.product(['Exams','Labs'],['I','II'])) + index = list(itertools.product(['Ada', 'Quinn', 'Violet'], + ['Comp', 'Math', 'Sci'])) + headr = list(itertools.product(['Exams', 'Labs'], ['I', 'II'])) - indx = pd.MultiIndex.from_tuples(index,names=['Student','Course']) - cols = pd.MultiIndex.from_tuples(headr) #Notice these are un-named + indx = pd.MultiIndex.from_tuples(index, names=['Student', 'Course']) + cols = pd.MultiIndex.from_tuples(headr) # Notice these are un-named - data = [[70+x+y+(x*y)%3 for x in range(4)] for y in range(9)] + data = [[70 + x + y + (x * y) % 3 for x in range(4)] for y in range(9)] - df = pd.DataFrame(data,indx,cols); df + df = pd.DataFrame(data, indx, cols) + df All = slice(None) df.loc['Violet'] - df.loc[(All,'Math'),All] - df.loc[(slice('Ada','Quinn'),'Math'),All] - df.loc[(All,'Math'),('Exams')] - df.loc[(All,'Math'),(All,'II')] + df.loc[(All, 'Math'), All] + df.loc[(slice('Ada', 'Quinn'), 'Math'), All] + df.loc[(All, 'Math'), ('Exams')] + df.loc[(All, 'Math'), (All, 'II')] `Setting portions of a MultiIndex with xs `__ @@ -422,7 +461,9 @@ Fill forward a reversed timeseries .. ipython:: python - df = pd.DataFrame(np.random.randn(6,1), index=pd.date_range('2013-08-01', periods=6, freq='B'), columns=list('A')) + df = pd.DataFrame(np.random.randn(6, 1), + index=pd.date_range('2013-08-01', periods=6, freq='B'), + columns=list('A')) df.loc[df.index[3], 'A'] = np.nan df df.reindex(df.index[::-1]).ffill() @@ -453,9 +494,10 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to df = pd.DataFrame({'animal': 'cat dog cat fish dog cat cat'.split(), 'size': list('SSMMMLL'), 'weight': [8, 10, 11, 1, 20, 12, 12], - 'adult' : [False] * 5 + [True] * 2}); df + 'adult': [False] * 5 + [True] * 2}) + df - #List the size of the animals with the highest weight. 
+ # List the size of the animals with the highest weight. df.groupby('animal').apply(lambda subf: subf['size'][subf['weight'].idxmax()]) `Using get_group @@ -473,11 +515,12 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python def GrowUp(x): - avg_weight = sum(x[x['size'] == 'S'].weight * 1.5) - avg_weight += sum(x[x['size'] == 'M'].weight * 1.25) - avg_weight += sum(x[x['size'] == 'L'].weight) - avg_weight /= len(x) - return pd.Series(['L',avg_weight,True], index=['size', 'weight', 'adult']) + avg_weight = sum(x[x['size'] == 'S'].weight * 1.5) + avg_weight += sum(x[x['size'] == 'M'].weight * 1.25) + avg_weight += sum(x[x['size'] == 'L'].weight) + avg_weight /= len(x) + return pd.Series(['L', avg_weight, True], + index=['size', 'weight', 'adult']) expected_df = gb.apply(GrowUp) @@ -488,13 +531,13 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - S = pd.Series([i / 100.0 for i in range(1,11)]) + S = pd.Series([i / 100.0 for i in range(1, 11)]) - def CumRet(x,y): - return x * (1 + y) + def CumRet(x, y): + return x * (1 + y) def Red(x): - return functools.reduce(CumRet,x,1.0) + return functools.reduce(CumRet, x, 1.0) S.expanding().apply(Red, raw=True) @@ -504,7 +547,7 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame({'A' : [1, 1, 2, 2], 'B' : [1, -1, 1, 2]}) + df = pd.DataFrame({'A': [1, 1, 2, 2], 'B': [1, -1, 1, 2]}) gb = df.groupby('A') def replace(g): @@ -535,15 +578,15 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. 
ipython:: python - rng = pd.date_range(start="2014-10-07",periods=10,freq='2min') - ts = pd.Series(data = list(range(10)), index = rng) + rng = pd.date_range(start="2014-10-07", periods=10, freq='2min') + ts = pd.Series(data=list(range(10)), index=rng) def MyCust(x): - if len(x) > 2: - return x[1] * 1.234 - return pd.NaT + if len(x) > 2: + return x[1] * 1.234 + return pd.NaT - mhc = {'Mean' : np.mean, 'Max' : np.max, 'Custom' : MyCust} + mhc = {'Mean': np.mean, 'Max': np.max, 'Custom': MyCust} ts.resample("5min").apply(mhc) ts @@ -553,7 +596,8 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python df = pd.DataFrame({'Color': 'Red Red Red Blue'.split(), - 'Value': [100, 150, 50, 50]}); df + 'Value': [100, 150, 50, 50]}) + df df['Counts'] = df.groupby(['Color']).transform(len) df @@ -562,11 +606,12 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. ipython:: python - df = pd.DataFrame( - {u'line_race': [10, 10, 8, 10, 10, 8], - u'beyer': [99, 102, 103, 103, 88, 100]}, - index=[u'Last Gunfighter', u'Last Gunfighter', u'Last Gunfighter', - u'Paynter', u'Paynter', u'Paynter']); df + df = pd.DataFrame({u'line_race': [10, 10, 8, 10, 10, 8], + u'beyer': [99, 102, 103, 103, 88, 100]}, + index=[u'Last Gunfighter', u'Last Gunfighter', + u'Last Gunfighter', u'Paynter', u'Paynter', + u'Paynter']) + df df['beyer_shifted'] = df.groupby(level=0)['beyer'].shift(1) df @@ -575,9 +620,9 @@ Unlike agg, apply's callable is passed a sub-DataFrame which gives you access to .. 
ipython:: python - df = pd.DataFrame({'host':['other','other','that','this','this'], - 'service':['mail','web','mail','mail','web'], - 'no':[1, 2, 1, 2, 1]}).set_index(['host', 'service']) + df = pd.DataFrame({'host': ['other', 'other', 'that', 'this', 'this'], + 'service': ['mail', 'web', 'mail', 'mail', 'web'], + 'no': [1, 2, 1, 2, 1]}).set_index(['host', 'service']) mask = df.groupby(level=0).agg('idxmax') df_count = df.loc[mask['no']].reset_index() df_count @@ -613,10 +658,12 @@ Create a list of dataframes, split using a delineation based on logic included i .. ipython:: python - df = pd.DataFrame(data={'Case' : ['A','A','A','B','A','A','B','A','A'], - 'Data' : np.random.randn(9)}) + df = pd.DataFrame(data={'Case': ['A', 'A', 'A', 'B', 'A', 'A', 'B', 'A', + 'A'], + 'Data': np.random.randn(9)}) - dfs = list(zip(*df.groupby((1*(df['Case']=='B')).cumsum().rolling(window=3,min_periods=1).median())))[-1] + dfs = list(zip(*df.groupby((1 * (df['Case'] == 'B')).cumsum() + .rolling(window=3, min_periods=1).median())))[-1] dfs[0] dfs[1] @@ -633,10 +680,13 @@ The :ref:`Pivot ` docs. .. ipython:: python - df = pd.DataFrame(data={'Province' : ['ON','QC','BC','AL','AL','MN','ON'], - 'City' : ['Toronto','Montreal','Vancouver','Calgary','Edmonton','Winnipeg','Windsor'], - 'Sales' : [13,6,16,8,4,3,1]}) - table = pd.pivot_table(df,values=['Sales'],index=['Province'],columns=['City'],aggfunc=np.sum,margins=True) + df = pd.DataFrame(data={'Province': ['ON', 'QC', 'BC', 'AL', 'AL', 'MN', 'ON'], + 'City': ['Toronto', 'Montreal', 'Vancouver', + 'Calgary', 'Edmonton', 'Winnipeg', + 'Windsor'], + 'Sales': [13, 6, 16, 8, 4, 3, 1]}) + table = pd.pivot_table(df, values=['Sales'], index=['Province'], + columns=['City'], aggfunc=np.sum, margins=True) table.stack('City') `Frequency table like plyr in R @@ -644,20 +694,26 @@ The :ref:`Pivot ` docs. .. 
ipython:: python - grades = [48,99,75,80,42,80,72,68,36,78] - df = pd.DataFrame( {'ID': ["x%d" % r for r in range(10)], - 'Gender' : ['F', 'M', 'F', 'M', 'F', 'M', 'F', 'M', 'M', 'M'], - 'ExamYear': ['2007','2007','2007','2008','2008','2008','2008','2009','2009','2009'], - 'Class': ['algebra', 'stats', 'bio', 'algebra', 'algebra', 'stats', 'stats', 'algebra', 'bio', 'bio'], - 'Participated': ['yes','yes','yes','yes','no','yes','yes','yes','yes','yes'], - 'Passed': ['yes' if x > 50 else 'no' for x in grades], - 'Employed': [True,True,True,False,False,False,False,True,True,False], - 'Grade': grades}) + grades = [48, 99, 75, 80, 42, 80, 72, 68, 36, 78] + df = pd.DataFrame({'ID': ["x%d" % r for r in range(10)], + 'Gender': ['F', 'M', 'F', 'M', 'F', + 'M', 'F', 'M', 'M', 'M'], + 'ExamYear': ['2007', '2007', '2007', '2008', '2008', + '2008', '2008', '2009', '2009', '2009'], + 'Class': ['algebra', 'stats', 'bio', 'algebra', + 'algebra', 'stats', 'stats', 'algebra', + 'bio', 'bio'], + 'Participated': ['yes', 'yes', 'yes', 'yes', 'no', + 'yes', 'yes', 'yes', 'yes', 'yes'], + 'Passed': ['yes' if x > 50 else 'no' for x in grades], + 'Employed': [True, True, True, False, + False, False, False, True, True, False], + 'Grade': grades}) df.groupby('ExamYear').agg({'Participated': lambda x: x.value_counts()['yes'], - 'Passed': lambda x: sum(x == 'yes'), - 'Employed' : lambda x : sum(x), - 'Grade' : lambda x : sum(x) / len(x)}) + 'Passed': lambda x: sum(x == 'yes'), + 'Employed': lambda x: sum(x), + 'Grade': lambda x: sum(x) / len(x)}) `Plot pandas DataFrame with year over year data `__ @@ -680,12 +736,15 @@ Apply .. 
ipython:: python - df = pd.DataFrame(data={'A' : [[2,4,8,16],[100,200],[10,20,30]], 'B' : [['a','b','c'],['jj','kk'],['ccc']]},index=['I','II','III']) + df = pd.DataFrame(data={'A': [[2, 4, 8, 16], [100, 200], [10, 20, 30]], + 'B': [['a', 'b', 'c'], ['jj', 'kk'], ['ccc']]}, + index=['I', 'II', 'III']) def SeriesFromSubList(aList): - return pd.Series(aList) + return pd.Series(aList) - df_orgz = pd.concat(dict([ (ind,row.apply(SeriesFromSubList)) for ind,row in df.iterrows() ])) + df_orgz = pd.concat({ind: row.apply(SeriesFromSubList) + for ind, row in df.iterrows()}) `Rolling Apply with a DataFrame returning a Series `__ Rolling Apply to multiple columns where function calculates a Series before a Scalar from the Series is returned .. ipython:: python - df = pd.DataFrame(data=np.random.randn(2000,2)/10000, - index=pd.date_range('2001-01-01',periods=2000), - columns=['A','B']); df + df = pd.DataFrame(data=np.random.randn(2000, 2) / 10000, + index=pd.date_range('2001-01-01', periods=2000), + columns=['A', 'B']) + df - def gm(aDF,Const): - v = ((((aDF.A+aDF.B)+1).cumprod())-1)*Const - return (aDF.index[0],v.iloc[-1]) + def gm(aDF, Const): + v = ((((aDF.A + aDF.B) + 1).cumprod()) - 1) * Const + return (aDF.index[0], v.iloc[-1]) - S = pd.Series(dict([ gm(df.iloc[i:min(i+51,len(df)-1)],5) for i in range(len(df)-50) ])); S + S = pd.Series(dict(gm(df.iloc[i:min(i + 51, len(df) - 1)], 5) + for i in range(len(df) - 50))) + S `Rolling apply with a DataFrame returning a Scalar `__ Rolling Apply to multiple columns where function returns a Scalar (Volume Weighted Average Price) .. 
ipython:: python - rng = pd.date_range(start = '2014-01-01',periods = 100) - df = pd.DataFrame({'Open' : np.random.randn(len(rng)), - 'Close' : np.random.randn(len(rng)), - 'Volume' : np.random.randint(100,2000,len(rng))}, index=rng); df + rng = pd.date_range(start='2014-01-01', periods=100) + df = pd.DataFrame({'Open': np.random.randn(len(rng)), + 'Close': np.random.randn(len(rng)), + 'Volume': np.random.randint(100, 2000, len(rng))}, + index=rng) + df - def vwap(bars): return ((bars.Close*bars.Volume).sum()/bars.Volume.sum()) + def vwap(bars): + return ((bars.Close * bars.Volume).sum() / bars.Volume.sum()) window = 5 - s = pd.concat([ (pd.Series(vwap(df.iloc[i:i+window]), index=[df.index[i+window]])) for i in range(len(df)-window) ]); + s = pd.concat([(pd.Series(vwap(df.iloc[i:i + window]), + index=[df.index[i + window]])) + for i in range(len(df) - window)]) + s.round(2) s.round(2) Timeseries @@ -806,21 +874,25 @@ Depending on df construction, ``ignore_index`` may be needed .. ipython:: python - df = df1.append(df2,ignore_index=True); df + df = df1.append(df2, ignore_index=True) + df `Self Join of a DataFrame `__ .. ipython:: python - df = pd.DataFrame(data={'Area' : ['A'] * 5 + ['C'] * 2, - 'Bins' : [110] * 2 + [160] * 3 + [40] * 2, - 'Test_0' : [0, 1, 0, 1, 2, 0, 1], - 'Data' : np.random.randn(7)});df + df = pd.DataFrame(data={'Area': ['A'] * 5 + ['C'] * 2, + 'Bins': [110] * 2 + [160] * 3 + [40] * 2, + 'Test_0': [0, 1, 0, 1, 2, 0, 1], + 'Data': np.random.randn(7)}) + df df['Test_1'] = df['Test_0'] - 1 - pd.merge(df, df, left_on=['Bins', 'Area','Test_0'], right_on=['Bins', 'Area','Test_1'],suffixes=('_L','_R')) + pd.merge(df, df, left_on=['Bins', 'Area', 'Test_0'], + right_on=['Bins', 'Area', 'Test_1'], + suffixes=('_L', '_R')) `How to set the index and join `__ @@ -871,8 +943,8 @@ The :ref:`Plotting ` docs. .. 
ipython:: python df = pd.DataFrame( - {u'stratifying_var': np.random.uniform(0, 100, 20), - u'price': np.random.normal(100, 5, 20)}) + {u'stratifying_var': np.random.uniform(0, 100, 20), + u'price': np.random.normal(100, 5, 20)}) df[u'quartiles'] = pd.qcut( df[u'stratifying_var'], @@ -951,7 +1023,6 @@ You can use the same approach to read all files matching a pattern. Here is an .. ipython:: python - import glob files = glob.glob('file_*.csv') result = pd.concat([pd.read_csv(f) for f in files], ignore_index=True) @@ -970,9 +1041,9 @@ Parsing date components in multi-columns is faster with a format .. code-block:: ipython - In [30]: i = pd.date_range('20000101',periods=10000) + In [30]: i = pd.date_range('20000101', periods=10000) - In [31]: df = pd.DataFrame(dict(year = i.year, month = i.month, day = i.day)) + In [31]: df = pd.DataFrame({'year': i.year, 'month': i.month, 'day': i.day}) In [32]: df.head() Out[32]: @@ -983,11 +1054,12 @@ Parsing date components in multi-columns is faster with a format 3 4 1 2000 4 5 1 2000 - In [33]: %timeit pd.to_datetime(df.year*10000+df.month*100+df.day,format='%Y%m%d') - 100 loops, best of 3: 7.08 ms per loop + In [33]: %timeit pd.to_datetime(df.year * 10000 + df.month * 100 + df.day, format='%Y%m%d') + 4.8 ms ± 23.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) # simulate combinging into a string, then parsing - In [34]: ds = df.apply(lambda x: "%04d%02d%02d" % (x['year'],x['month'],x['day']),axis=1) + In [34]: ds = df.apply(lambda x: "%04d%02d%02d" % (x['year'], + x['month'], x['day']), axis=1) In [35]: ds.head() Out[35]: @@ -999,6 +1071,7 @@ Parsing date components in multi-columns is faster with a format dtype: object In [36]: %timeit pd.to_datetime(ds) + Out[36]: 1 loops, best of 3: 488 ms per loop Skip row between header and data @@ -1032,8 +1105,8 @@ Option 1: pass rows explicitly to skip rows .. 
ipython:: python - pd.read_csv(StringIO(data), sep=';', skiprows=[11,12], - index_col=0, parse_dates=True, header=10) + pd.read_csv(StringIO(data), sep=';', skiprows=[11, 12], + index_col=0, parse_dates=True, header=10) Option 2: read column names and then data """"""""""""""""""""""""""""""""""""""""" @@ -1138,12 +1211,12 @@ Storing Attributes to a group node .. ipython:: python - df = pd.DataFrame(np.random.randn(8,3)) + df = pd.DataFrame(np.random.randn(8, 3)) store = pd.HDFStore('test.h5') - store.put('df',df) + store.put('df', df) # you can store an arbitrary Python object via pickle - store.get_storer('df').attrs.my_attribute = dict(A = 10) + store.get_storer('df').attrs.my_attribute = {'A': 10} store.get_storer('df').attrs.my_attribute .. ipython:: python @@ -1267,6 +1340,7 @@ The `method` argument within `DataFrame.corr` can accept a callable in addition ... return cov_ab / std_a / std_b ... ... + ... >>> df = pd.DataFrame(np.random.normal(size=(100, 3))) ... >>> df.corr(method=distcorr) @@ -1285,17 +1359,17 @@ The :ref:`Timedeltas ` docs. .. ipython:: python - s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D')) + s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D')) s - s.max() s.max() - s - s - datetime.datetime(2011,1,1,3,5) + s - datetime.datetime(2011, 1, 1, 3, 5) s + datetime.timedelta(minutes=5) - datetime.datetime(2011,1,1,3,5) - s + datetime.datetime(2011, 1, 1, 3, 5) - s datetime.timedelta(minutes=5) + s @@ -1304,13 +1378,15 @@ The :ref:`Timedeltas ` docs. .. 
ipython:: python - deltas = pd.Series([ datetime.timedelta(days=i) for i in range(3) ]) + deltas = pd.Series([datetime.timedelta(days=i) for i in range(3)]) - df = pd.DataFrame(dict(A = s, B = deltas)); df + df = pd.DataFrame({'A': s, 'B': deltas}) + df - df['New Dates'] = df['A'] + df['B']; + df['New Dates'] = df['A'] + df['B'] - df['Delta'] = df['A'] - df['New Dates']; df + df['Delta'] = df['A'] - df['New Dates'] + df df.dtypes @@ -1321,9 +1397,11 @@ Values can be set to NaT using np.nan, similar to datetime .. ipython:: python - y = s - s.shift(); y + y = s - s.shift() + y - y[1] = np.nan; y + y[1] = np.nan + y Aliasing Axis Names ------------------- @@ -1333,23 +1411,24 @@ To globally provide aliases for axis names, one can define these 2 functions: .. ipython:: python def set_axis_alias(cls, axis, alias): - if axis not in cls._AXIS_NUMBERS: - raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) - cls._AXIS_ALIASES[alias] = axis + if axis not in cls._AXIS_NUMBERS: + raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) + cls._AXIS_ALIASES[alias] = axis .. ipython:: python def clear_axis_alias(cls, axis, alias): - if axis not in cls._AXIS_NUMBERS: - raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) - cls._AXIS_ALIASES.pop(alias,None) + if axis not in cls._AXIS_NUMBERS: + raise Exception("invalid axis [%s] for alias [%s]" % (axis, alias)) + cls._AXIS_ALIASES.pop(alias, None) .. 
ipython:: python - set_axis_alias(pd.DataFrame,'columns', 'myaxis2') - df2 = pd.DataFrame(np.random.randn(3,2),columns=['c1','c2'],index=['i1','i2','i3']) + set_axis_alias(pd.DataFrame, 'columns', 'myaxis2') + df2 = pd.DataFrame(np.random.randn(3, 2), columns=['c1', 'c2'], + index=['i1', 'i2', 'i3']) df2.sum(axis='myaxis2') - clear_axis_alias(pd.DataFrame,'columns', 'myaxis2') + clear_axis_alias(pd.DataFrame, 'columns', 'myaxis2') Creating Example Data --------------------- To create a dataframe from every combination of some given values, like R's ``expand.grid()`` function, we can create a dict where the keys are column names and the values are lists of the data values: def expand_grid(data_dict): - rows = itertools.product(*data_dict.values()) - return pd.DataFrame.from_records(rows, columns=data_dict.keys()) + rows = itertools.product(*data_dict.values()) + return pd.DataFrame.from_records(rows, columns=data_dict.keys()) df = expand_grid( - {'height': [60, 70], - 'weight': [100, 140, 180], - 'sex': ['Male', 'Female']}) + {'height': [60, 70], + 'weight': [100, 140, 180], + 'sex': ['Male', 'Female']}) df From c3c26ea9ad43aa0975281a94fdada44eca64ef1d Mon Sep 17 00:00:00 2001 From: saurav Date: Wed, 21 Nov 2018 09:54:52 +0530 Subject: [PATCH 2/3] compatible with PEP-8 standard --- doc/source/cookbook.rst | 45 ++++++++++------------------------------- 1 file changed, 11 insertions(+), 34 deletions(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index b73358c67b1ab..3a4fa4ad518fb 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -1039,40 +1039,17 @@ Parsing date components in multi-columns Parsing date components in multi-columns is faster with a format -.. code-block:: ipython - - In [30]: i = pd.date_range('20000101', periods=10000) - - In [31]: df = pd.DataFrame({'year': i.year, 'month': i.month, 'day': i.day}) - - In [32]: df.head() - Out[32]: - day month year - 0 1 1 2000 - 1 2 1 2000 - 2 3 1 2000 - 3 4 1 2000 - 4 5 1 2000 - - In [33]: %timeit pd.to_datetime(df.year * 10000 + df.month * 100 + df.day, format='%Y%m%d') - 4.8 ms ± 23.7 µs per loop (mean ± std. dev. 
of 7 runs, 100 loops each) - - # simulate combinging into a string, then parsing - In [34]: ds = df.apply(lambda x: "%04d%02d%02d" % (x['year'], - x['month'], x['day']), axis=1) - - In [35]: ds.head() - Out[35]: - 0 20000101 - 1 20000102 - 2 20000103 - 3 20000104 - 4 20000105 - dtype: object - - In [36]: %timeit pd.to_datetime(ds) - Out[36]: - 1 loops, best of 3: 488 ms per loop +.. ipython:: python + i = pd.date_range('20000101', periods=10000) + df = pd.DataFrame({'year': i.year, 'month': i.month, 'day': i.day}) + df.head() + %timeit pd.to_datetime(df.year * 10000 + df.month * 100 + df.day, + format='%Y%m%d') + ds = df.apply(lambda x: "%04d%02d%02d" % (x['year'], + x['month'], x['day']), axis=1) + + ds.head() + %timeit pd.to_datetime(ds) Skip row between header and data ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 27b32452358bf6555be39753b73c3b07ce695e9b Mon Sep 17 00:00:00 2001 From: saurav Date: Wed, 21 Nov 2018 10:54:36 +0530 Subject: [PATCH 3/3] DOC: compatible with PEP-8 standard --- doc/source/cookbook.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/cookbook.rst b/doc/source/cookbook.rst index 3a4fa4ad518fb..bd2b245adb3f5 100644 --- a/doc/source/cookbook.rst +++ b/doc/source/cookbook.rst @@ -1038,7 +1038,7 @@ Parsing date components in multi-columns ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Parsing date components in multi-columns is faster with a format - + .. ipython:: python i = pd.date_range('20000101', periods=10000) df = pd.DataFrame({'year': i.year, 'month': i.month, 'day': i.day})