Data Frame Manipulation
import pandas as pd
data={'col1':[1,2,3],'col2':[3,2,1],'col3':[5,5,5]}
df=pd.DataFrame(data)
df
|
col1 |
col2 |
col3 |
0 |
1 |
3 |
5 |
1 |
2 |
2 |
5 |
2 |
3 |
1 |
5 |
Create New Column using a Function
# create a function called times100
def TestValue(x):
if x>2:
return True
else:
return False
df['bigger_than_2_by_function']=df['col1'].map(TestValue)
df
|
col1 |
col2 |
col3 |
bigger_than_2_by_function |
0 |
1 |
3 |
5 |
False |
1 |
2 |
2 |
5 |
False |
2 |
3 |
1 |
5 |
True |
Create New Column directly
df['bigger_than_2']=df['col1']>2
df
|
col1 |
col2 |
col3 |
bigger_than_2_by_function |
bigger_than_2 |
0 |
1 |
3 |
5 |
False |
False |
1 |
2 |
2 |
5 |
False |
False |
2 |
3 |
1 |
5 |
True |
True |
Column basic maths
df['added']=df['col1']+df['col2']+df['col3']
df
|
col1 |
col2 |
col3 |
bigger_than_2_by_function |
bigger_than_2 |
added |
0 |
1 |
3 |
5 |
False |
False |
9 |
1 |
2 |
2 |
5 |
False |
False |
9 |
2 |
3 |
1 |
5 |
True |
True |
9 |
df['other_maths']=df['col1']*df['col2']/df['col3']
df
|
col1 |
col2 |
col3 |
bigger_than_2_by_function |
bigger_than_2 |
added |
other_maths |
0 |
1 |
3 |
5 |
False |
False |
9 |
0.6 |
1 |
2 |
2 |
5 |
False |
False |
9 |
0.8 |
2 |
3 |
1 |
5 |
True |
True |
9 |
0.6 |
Merge/Join
I deliberatly created 2 datatables with different key names, but matching key_values. This makes the pandas a little more difficult.
import pandas as pd
people={'id':[1,2,3,4],'name':['Tom','Bob','Harry','Frank']}
jobs={'people_id':[1,2,4],'job_desc':['Web','Db','C++']}
p_df = pd.DataFrame(people)
j_df = pd.DataFrame(jobs)
p_df
|
id |
name |
0 |
1 |
Tom |
1 |
2 |
Bob |
2 |
3 |
Harry |
3 |
4 |
Frank |
j_df
|
job_desc |
people_id |
0 |
Web |
1 |
1 |
Db |
2 |
2 |
C++ |
4 |
#Inner Join
pd.merge(p_df,j_df,left_on='id',right_on='people_id',how=inner)
|
id |
name |
job_desc |
people_id |
0 |
1 |
Tom |
Web |
1 |
1 |
2 |
Bob |
Db |
2 |
2 |
4 |
Frank |
C++ |
4 |
pd.merge(p_df,j_df,left_on='id',right_on='people_id',how='outer')
|
id |
name |
job_desc |
people_id |
0 |
1 |
Tom |
Web |
1.0 |
1 |
2 |
Bob |
Db |
2.0 |
2 |
3 |
Harry |
NaN |
NaN |
3 |
4 |
Frank |
C++ |
4.0 |
pd.merge(p_df,j_df,left_on='id',right_on='people_id',how='left')
|
id |
name |
job_desc |
people_id |
0 |
1 |
Tom |
Web |
1.0 |
1 |
2 |
Bob |
Db |
2.0 |
2 |
3 |
Harry |
NaN |
NaN |
3 |
4 |
Frank |
C++ |
4.0 |
pd.merge(p_df,j_df,left_on='id',right_on='people_id',how='right')
|
id |
name |
job_desc |
people_id |
0 |
1 |
Tom |
Web |
1 |
1 |
2 |
Bob |
Db |
2 |
2 |
4 |
Frank |
C++ |
4 |
Cleaning up Certain Values
import pandas as pd
people={'id':[1,2,3,4],'name':['Tom','Bob','Harry','Frank']}
jobs={'people_id':[1,2,4],'job_desc':['Web','Db','C++']}
p_df = pd.DataFrame(people)
j_df = pd.DataFrame(jobs)
pj_df=pd.merge(p_df,j_df,left_on='id',right_on='people_id',how='outer')
pj_df
|
id |
name |
job_desc |
people_id |
0 |
1 |
Tom |
Web |
1.0 |
1 |
2 |
Bob |
Db |
2.0 |
2 |
3 |
Harry |
NaN |
NaN |
3 |
4 |
Frank |
C++ |
4.0 |
#Need to fix the job_desc and the people_id which is NaN
#But they are different types - So I will do each one seperatly
pj_df['job_desc'].notnull()
#This index then is used with the ['job_desc'] Column so where only acts in False values, and we set the value to ""
0 True
1 True
2 False
3 True
Name: job_desc, dtype: bool
pj_df['job_desc']=pj_df['job_desc'].where(pj_df['job_desc'].notnull(),"")
pj_df
|
id |
name |
job_desc |
people_id |
0 |
1 |
Tom |
Web |
1.0 |
1 |
2 |
Bob |
Db |
2.0 |
2 |
3 |
Harry |
|
NaN |
3 |
4 |
Frank |
C++ |
4.0 |
pj_df['people_id']=pj_df['job_desc'].where(pj_df['people_id'].notnull(),0)
pj_df
|
id |
name |
job_desc |
people_id |
0 |
1 |
Tom |
Web |
Web |
1 |
2 |
Bob |
Db |
Db |
2 |
3 |
Harry |
|
0 |
3 |
4 |
Frank |
C++ |
C++ |
Visualise your data
%matplotlib inline
import pandas as pd
ship=pd.DataFrame.from_csv('train.csv')
#Remove all NaN
ship['Age']= ship['Age'].where(ship['Age'].notnull(),0)
ship['Cabin']= ship['Cabin'].where(ship['Cabin'].notnull(),"")
ship['Embarked']= ship['Embarked'].where(ship['Embarked'].notnull(),"")
#Round fares to £2 "chunks"
def chunk_fare(x):
x=int(x/2)*2
return x
from matplotlib import pyplot as plt
rounded_fare=ship['Fare'].map(chunk_fare)
unique_fares=rounded_fare.unique()
unique_fares.sort()
rf=pd.DataFrame(rounded_fare)
fare_counts=rf.groupby(by='Fare').size()
plt.bar(unique_fares,fare_counts)
#This shows the range is too Great as there were supposed to be 1/2/3 tyoes of Tickets
<Container object of 61 artists>

def chunk_fare_v2(x):
if x<14:
return 3
elif x<50:
return 2
else:
return 1
rounded_fare=ship['Fare'].map(chunk_fare_v2)
unique_fares=rounded_fare.unique()
unique_fares.sort()
rf=pd.DataFrame(rounded_fare)
fare_counts=rf.groupby(by='Fare').size()
plt.title("Titanic ticket class based on price")
plt.bar(unique_fares,fare_counts)
<Container object of 3 artists>

ship['Ticket_Cost']=ship['Fare'].map(chunk_fare_v2)
ship
|
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
Class |
PassengerId |
|
|
|
|
|
|
|
|
|
|
|
|
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
|
S |
3 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
1 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
|
S |
3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
1 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
|
S |
3 |
6 |
0 |
3 |
Moran, Mr. James |
male |
0.0 |
0 |
0 |
330877 |
8.4583 |
|
Q |
3 |
7 |
0 |
1 |
McCarthy, Mr. Timothy J |
male |
54.0 |
0 |
0 |
17463 |
51.8625 |
E46 |
S |
1 |
8 |
0 |
3 |
Palsson, Master. Gosta Leonard |
male |
2.0 |
3 |
1 |
349909 |
21.0750 |
|
S |
2 |
9 |
1 |
3 |
Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) |
female |
27.0 |
0 |
2 |
347742 |
11.1333 |
|
S |
3 |
10 |
1 |
2 |
Nasser, Mrs. Nicholas (Adele Achem) |
female |
14.0 |
1 |
0 |
237736 |
30.0708 |
|
C |
2 |
11 |
1 |
3 |
Sandstrom, Miss. Marguerite Rut |
female |
4.0 |
1 |
1 |
PP 9549 |
16.7000 |
G6 |
S |
2 |
12 |
1 |
1 |
Bonnell, Miss. Elizabeth |
female |
58.0 |
0 |
0 |
113783 |
26.5500 |
C103 |
S |
2 |
13 |
0 |
3 |
Saundercock, Mr. William Henry |
male |
20.0 |
0 |
0 |
A/5. 2151 |
8.0500 |
|
S |
3 |
14 |
0 |
3 |
Andersson, Mr. Anders Johan |
male |
39.0 |
1 |
5 |
347082 |
31.2750 |
|
S |
2 |
15 |
0 |
3 |
Vestrom, Miss. Hulda Amanda Adolfina |
female |
14.0 |
0 |
0 |
350406 |
7.8542 |
|
S |
3 |
16 |
1 |
2 |
Hewlett, Mrs. (Mary D Kingcome) |
female |
55.0 |
0 |
0 |
248706 |
16.0000 |
|
S |
2 |
17 |
0 |
3 |
Rice, Master. Eugene |
male |
2.0 |
4 |
1 |
382652 |
29.1250 |
|
Q |
2 |
18 |
1 |
2 |
Williams, Mr. Charles Eugene |
male |
0.0 |
0 |
0 |
244373 |
13.0000 |
|
S |
3 |
19 |
0 |
3 |
Vander Planke, Mrs. Julius (Emelia Maria Vande... |
female |
31.0 |
1 |
0 |
345763 |
18.0000 |
|
S |
2 |
20 |
1 |
3 |
Masselmani, Mrs. Fatima |
female |
0.0 |
0 |
0 |
2649 |
7.2250 |
|
C |
3 |
21 |
0 |
2 |
Fynney, Mr. Joseph J |
male |
35.0 |
0 |
0 |
239865 |
26.0000 |
|
S |
2 |
22 |
1 |
2 |
Beesley, Mr. Lawrence |
male |
34.0 |
0 |
0 |
248698 |
13.0000 |
D56 |
S |
3 |
23 |
1 |
3 |
McGowan, Miss. Anna "Annie" |
female |
15.0 |
0 |
0 |
330923 |
8.0292 |
|
Q |
3 |
24 |
1 |
1 |
Sloper, Mr. William Thompson |
male |
28.0 |
0 |
0 |
113788 |
35.5000 |
A6 |
S |
2 |
25 |
0 |
3 |
Palsson, Miss. Torborg Danira |
female |
8.0 |
3 |
1 |
349909 |
21.0750 |
|
S |
2 |
26 |
1 |
3 |
Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... |
female |
38.0 |
1 |
5 |
347077 |
31.3875 |
|
S |
2 |
27 |
0 |
3 |
Emir, Mr. Farred Chehab |
male |
0.0 |
0 |
0 |
2631 |
7.2250 |
|
C |
3 |
28 |
0 |
1 |
Fortune, Mr. Charles Alexander |
male |
19.0 |
3 |
2 |
19950 |
263.0000 |
C23 C25 C27 |
S |
1 |
29 |
1 |
3 |
O'Dwyer, Miss. Ellen "Nellie" |
female |
0.0 |
0 |
0 |
330959 |
7.8792 |
|
Q |
3 |
30 |
0 |
3 |
Todoroff, Mr. Lalio |
male |
0.0 |
0 |
0 |
349216 |
7.8958 |
|
S |
3 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
862 |
0 |
2 |
Giles, Mr. Frederick Edward |
male |
21.0 |
1 |
0 |
28134 |
11.5000 |
|
S |
3 |
863 |
1 |
1 |
Swift, Mrs. Frederick Joel (Margaret Welles Ba... |
female |
48.0 |
0 |
0 |
17466 |
25.9292 |
D17 |
S |
2 |
864 |
0 |
3 |
Sage, Miss. Dorothy Edith "Dolly" |
female |
0.0 |
8 |
2 |
CA. 2343 |
69.5500 |
|
S |
1 |
865 |
0 |
2 |
Gill, Mr. John William |
male |
24.0 |
0 |
0 |
233866 |
13.0000 |
|
S |
3 |
866 |
1 |
2 |
Bystrom, Mrs. (Karolina) |
female |
42.0 |
0 |
0 |
236852 |
13.0000 |
|
S |
3 |
867 |
1 |
2 |
Duran y More, Miss. Asuncion |
female |
27.0 |
1 |
0 |
SC/PARIS 2149 |
13.8583 |
|
C |
3 |
868 |
0 |
1 |
Roebling, Mr. Washington Augustus II |
male |
31.0 |
0 |
0 |
PC 17590 |
50.4958 |
A24 |
S |
1 |
869 |
0 |
3 |
van Melkebeke, Mr. Philemon |
male |
0.0 |
0 |
0 |
345777 |
9.5000 |
|
S |
3 |
870 |
1 |
3 |
Johnson, Master. Harold Theodor |
male |
4.0 |
1 |
1 |
347742 |
11.1333 |
|
S |
3 |
871 |
0 |
3 |
Balkic, Mr. Cerin |
male |
26.0 |
0 |
0 |
349248 |
7.8958 |
|
S |
3 |
872 |
1 |
1 |
Beckwith, Mrs. Richard Leonard (Sallie Monypeny) |
female |
47.0 |
1 |
1 |
11751 |
52.5542 |
D35 |
S |
1 |
873 |
0 |
1 |
Carlsson, Mr. Frans Olof |
male |
33.0 |
0 |
0 |
695 |
5.0000 |
B51 B53 B55 |
S |
3 |
874 |
0 |
3 |
Vander Cruyssen, Mr. Victor |
male |
47.0 |
0 |
0 |
345765 |
9.0000 |
|
S |
3 |
875 |
1 |
2 |
Abelson, Mrs. Samuel (Hannah Wizosky) |
female |
28.0 |
1 |
0 |
P/PP 3381 |
24.0000 |
|
C |
2 |
876 |
1 |
3 |
Najib, Miss. Adele Kiamie "Jane" |
female |
15.0 |
0 |
0 |
2667 |
7.2250 |
|
C |
3 |
877 |
0 |
3 |
Gustafsson, Mr. Alfred Ossian |
male |
20.0 |
0 |
0 |
7534 |
9.8458 |
|
S |
3 |
878 |
0 |
3 |
Petroff, Mr. Nedelio |
male |
19.0 |
0 |
0 |
349212 |
7.8958 |
|
S |
3 |
879 |
0 |
3 |
Laleff, Mr. Kristo |
male |
0.0 |
0 |
0 |
349217 |
7.8958 |
|
S |
3 |
880 |
1 |
1 |
Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) |
female |
56.0 |
0 |
1 |
11767 |
83.1583 |
C50 |
C |
1 |
881 |
1 |
2 |
Shelley, Mrs. William (Imanita Parrish Hall) |
female |
25.0 |
0 |
1 |
230433 |
26.0000 |
|
S |
2 |
882 |
0 |
3 |
Markun, Mr. Johann |
male |
33.0 |
0 |
0 |
349257 |
7.8958 |
|
S |
3 |
883 |
0 |
3 |
Dahlberg, Miss. Gerda Ulrika |
female |
22.0 |
0 |
0 |
7552 |
10.5167 |
|
S |
3 |
884 |
0 |
2 |
Banfield, Mr. Frederick James |
male |
28.0 |
0 |
0 |
C.A./SOTON 34068 |
10.5000 |
|
S |
3 |
885 |
0 |
3 |
Sutehall, Mr. Henry Jr |
male |
25.0 |
0 |
0 |
SOTON/OQ 392076 |
7.0500 |
|
S |
3 |
886 |
0 |
3 |
Rice, Mrs. William (Margaret Norton) |
female |
39.0 |
0 |
5 |
382652 |
29.1250 |
|
Q |
2 |
887 |
0 |
2 |
Montvila, Rev. Juozas |
male |
27.0 |
0 |
0 |
211536 |
13.0000 |
|
S |
3 |
888 |
1 |
1 |
Graham, Miss. Margaret Edith |
female |
19.0 |
0 |
0 |
112053 |
30.0000 |
B42 |
S |
2 |
889 |
0 |
3 |
Johnston, Miss. Catherine Helen "Carrie" |
female |
0.0 |
1 |
2 |
W./C. 6607 |
23.4500 |
|
S |
2 |
890 |
1 |
1 |
Behr, Mr. Karl Howell |
male |
26.0 |
0 |
0 |
111369 |
30.0000 |
C148 |
C |
2 |
891 |
0 |
3 |
Dooley, Mr. Patrick |
male |
32.0 |
0 |
0 |
370376 |
7.7500 |
|
Q |
3 |
891 rows × 12 columns
BarGraphs to look at Data
%matplotlib inline
import pprint
import pandas as pd
ship=pd.DataFrame.from_csv('train.csv')
#Remove all NaN
ship['Age']= ship['Age'].where(ship['Age'].notnull(),0)
ship['Cabin']= ship['Cabin'].where(ship['Cabin'].notnull(),"")
ship['Embarked']= ship['Embarked'].where(ship['Embarked'].notnull(),"")
#Define some functions to help me look at the data
#Round fares to £2 "chunks"
def chunk_fare(x):
x=int(x/2)*2
return x
def chunk_fare_v2(x):
if x<14:
return 3
elif x<50:
return 2
else:
return 1
def age_function(x):
if x<14:
return 1
elif x<25:
return 2
elif x<45:
return 3
else:
return 4
#Function to look at 1 Column
def View_Col(col_data,function_to_call,title):
"""
Expects a Column of data
"""
new_col_data=col_data.map(function_to_call)
new_col_data.sort_values(inplace=True)
rf=pd.DataFrame(new_col_data)
rf.columns=['Data']
col_counts=rf.groupby(by='Data').size()
unique_cols=rf['Data'].unique()
plt.title=title
plt.bar(unique_cols,col_counts)
c=View_Col(ship['Age'],age_function,"Data Grouped by Age Function")

c=View_Col(ship['Fare'],chunk_fare,"Data Grouped by chuck_fare")

c=View_Col(ship['Fare'],chunk_fare_v2,"Data Grouped by chuck_fare")

Selecting data by columns
#Show only people with title's
titanic_df[titanic_df.title_code==1][titanic_df.survived==0]