Pandas working with Data Frames

In [9]:
from pandas import *
from numpy import *
In [12]:
df = DataFrame({
    'sp' : ['MM1', 'MM1', 'MM1', 'MM2', 'MM2', 'MM2', 'MM4', 'MM4', 'MM4'],
    'mt' : ['S1', 'S1', 'S3', 'S3', 'S4', 'S4', 'S2', 'S2', 'S2'],
    'val' : ['a', 'n', 'cb', 'mk', 'bg', 'dgb', 'rd', 'cb', 'uyi'],
    'count' : [3,2,5,8,10,1,2,2,7]
    })
In [4]:
df
Out[4]:
count mt sp val
0 3 S1 MM1 a
1 2 S1 MM1 n
2 5 S3 MM1 cb
3 8 S3 MM2 mk
4 10 S4 MM2 bg
5 1 S4 MM2 dgb
6 2 S2 MM4 rd
7 2 S2 MM4 cb
8 7 S2 MM4 uyi
In [5]:
df_grouped = df.groupby(['sp', 'mt']).agg({'count':'max'})
In [6]:
df_grouped
Out[6]:
count
sp mt
MM1 S1 3
S3 5
MM2 S3 8
S4 10
MM4 S2 7
In [7]:
df_grouped = df_grouped.reset_index()
In [8]:
df_grouped = df_grouped.rename(columns={'count':'count_max'})
In [10]:
df = merge(df, df_grouped, how='left', on=['sp', 'mt'])
In [11]:
df
Out[11]:
count mt sp val count_max
0 3 S1 MM1 a 3
1 2 S1 MM1 n 3
2 5 S3 MM1 cb 5
3 8 S3 MM2 mk 8
4 10 S4 MM2 bg 10
5 1 S4 MM2 dgb 10
6 2 S2 MM4 rd 7
7 2 S2 MM4 cb 7
8 7 S2 MM4 uyi 7
In [12]:
df = df[df['count'] == df['count_max']]
In [13]:
df
Out[13]:
count mt sp val count_max
0 3 S1 MM1 a 3
2 5 S3 MM1 cb 5
3 8 S3 MM2 mk 8
4 10 S4 MM2 bg 10
8 7 S2 MM4 uyi 7
In [19]:
df_grouped = df.groupby(['sp','mt']).agg({'count':'max'})
In [20]:
df_grouped
Out[20]:
count
sp mt
MM1 S1 3
S3 5
MM2 S3 8
S4 10
MM4 S2 7

df_g2=df_grouped(['mt','count']).agg({'cnt2':'max'})

df_g2

In [21]:
df_grouped.rename(columns={'count':'count_max'})
Out[21]:
count_max
sp mt
MM1 S1 3
S3 5
MM2 S3 8
S4 10
MM4 S2 7
In [21]:
#A little Excel type stuff for Pandas

#Add a Row num 
df['count'] = pandas.Series(range(len(df)))

#Add up a running total
df['running total'] = df['count'].cumsum()

#Difference from a scalar 
df['diff'] = 2014 - df['count']

#Moving Average
df['moving average'] = df['running total'] / df['count'].astype('float')

#Perform If type statements  (If 'moving average > 3' then 1)
df['new column'] = 0
mask = df['moving average'] >= 3
df.loc[mask, 'new column'] = 1
In [22]:
df
Out[22]:
count mt sp val running total diff moving average new column
0 0 S1 MM1 a 0 2014 NaN 0
1 1 S1 MM1 n 1 2013 1.0 0
2 2 S3 MM1 cb 3 2012 1.5 0
3 3 S3 MM2 mk 6 2011 2.0 0
4 4 S4 MM2 bg 10 2010 2.5 0
5 5 S4 MM2 dgb 15 2009 3.0 1
6 6 S2 MM4 rd 21 2008 3.5 1
7 7 S2 MM4 cb 28 2007 4.0 1
8 8 S2 MM4 uyi 36 2006 4.5 1
In [ ]: