machine learning

Data Frame Manipulation

import pandas as pd
data={'col1':[1,2,3],'col2':[3,2,1],'col3':[5,5,5]}

df=pd.DataFrame(data)

df
col1 col2 col3
0 1 3 5
1 2 2 5
2 3 1 5

Create New Column using a Function

# create a function called times100
def TestValue(x):
    if x>2:
        return True
    else:
        return False

df['bigger_than_2_by_function']=df['col1'].map(TestValue)
df
col1 col2 col3 bigger_than_2_by_function
0 1 3 5 False
1 2 2 5 False
2 3 1 5 True

Create New Column directly

df['bigger_than_2']=df['col1']>2
df
col1 col2 col3 bigger_than_2_by_function bigger_than_2
0 1 3 5 False False
1 2 2 5 False False
2 3 1 5 True True

Column basic maths

df['added']=df['col1']+df['col2']+df['col3']
df
col1 col2 col3 bigger_than_2_by_function bigger_than_2 added
0 1 3 5 False False 9
1 2 2 5 False False 9
2 3 1 5 True True 9
df['other_maths']=df['col1']*df['col2']/df['col3']
df
col1 col2 col3 bigger_than_2_by_function bigger_than_2 added other_maths
0 1 3 5 False False 9 0.6
1 2 2 5 False False 9 0.8
2 3 1 5 True True 9 0.6

Merge/Join

I deliberatly created 2 datatables with different key names, but matching key_values. This makes the pandas a little more difficult.

import pandas as pd

people={'id':[1,2,3,4],'name':['Tom','Bob','Harry','Frank']}
jobs={'people_id':[1,2,4],'job_desc':['Web','Db','C++']}
p_df = pd.DataFrame(people)
j_df = pd.DataFrame(jobs)
p_df
id name
0 1 Tom
1 2 Bob
2 3 Harry
3 4 Frank
j_df
job_desc people_id
0 Web 1
1 Db 2
2 C++ 4
#Inner Join
pd.merge(p_df,j_df,left_on='id',right_on='people_id',how=inner)
id name job_desc people_id
0 1 Tom Web 1
1 2 Bob Db 2
2 4 Frank C++ 4
pd.merge(p_df,j_df,left_on='id',right_on='people_id',how='outer')
id name job_desc people_id
0 1 Tom Web 1.0
1 2 Bob Db 2.0
2 3 Harry NaN NaN
3 4 Frank C++ 4.0
pd.merge(p_df,j_df,left_on='id',right_on='people_id',how='left')
id name job_desc people_id
0 1 Tom Web 1.0
1 2 Bob Db 2.0
2 3 Harry NaN NaN
3 4 Frank C++ 4.0
pd.merge(p_df,j_df,left_on='id',right_on='people_id',how='right')
id name job_desc people_id
0 1 Tom Web 1
1 2 Bob Db 2
2 4 Frank C++ 4

Cleaning up Certain Values

import pandas as pd
people={'id':[1,2,3,4],'name':['Tom','Bob','Harry','Frank']}
jobs={'people_id':[1,2,4],'job_desc':['Web','Db','C++']}
p_df = pd.DataFrame(people)
j_df = pd.DataFrame(jobs)
pj_df=pd.merge(p_df,j_df,left_on='id',right_on='people_id',how='outer')
pj_df
id name job_desc people_id
0 1 Tom Web 1.0
1 2 Bob Db 2.0
2 3 Harry NaN NaN
3 4 Frank C++ 4.0
#Need to fix the job_desc and the people_id which is NaN
#But they are different types - So I will do each one seperatly

pj_df['job_desc'].notnull()

#This index then is used with the ['job_desc'] Column so where only acts in False values, and we set the value to ""
0     True
1     True
2    False
3     True
Name: job_desc, dtype: bool
pj_df['job_desc']=pj_df['job_desc'].where(pj_df['job_desc'].notnull(),"")
pj_df
id name job_desc people_id
0 1 Tom Web 1.0
1 2 Bob Db 2.0
2 3 Harry NaN
3 4 Frank C++ 4.0
pj_df['people_id']=pj_df['job_desc'].where(pj_df['people_id'].notnull(),0)
pj_df
id name job_desc people_id
0 1 Tom Web Web
1 2 Bob Db Db
2 3 Harry 0
3 4 Frank C++ C++

Visualise your data

%matplotlib inline
import pandas as pd
ship=pd.DataFrame.from_csv('train.csv')
#Remove all NaN
ship['Age']=    ship['Age'].where(ship['Age'].notnull(),0)
ship['Cabin']=  ship['Cabin'].where(ship['Cabin'].notnull(),"")
ship['Embarked']=  ship['Embarked'].where(ship['Embarked'].notnull(),"")
#Round fares to £2 "chunks"
def chunk_fare(x):
    x=int(x/2)*2
    return x
from matplotlib import pyplot as plt
rounded_fare=ship['Fare'].map(chunk_fare)
unique_fares=rounded_fare.unique()
unique_fares.sort()
rf=pd.DataFrame(rounded_fare)
fare_counts=rf.groupby(by='Fare').size()
plt.bar(unique_fares,fare_counts)
#This shows the range is too Great as there were supposed to be 1/2/3 tyoes of Tickets
<Container object of 61 artists>

png

def chunk_fare_v2(x):
    if x<14:
        return 3
    elif x<50:
        return 2
    else:
        return 1
rounded_fare=ship['Fare'].map(chunk_fare_v2)
unique_fares=rounded_fare.unique()
unique_fares.sort()
rf=pd.DataFrame(rounded_fare)
fare_counts=rf.groupby(by='Fare').size()
plt.title("Titanic ticket class based on price")
plt.bar(unique_fares,fare_counts)
<Container object of 3 artists>

png

ship['Ticket_Cost']=ship['Fare'].map(chunk_fare_v2)
ship
Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Class
PassengerId
1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 S 3
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 1
3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 S 3
4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 1
5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 S 3
6 0 3 Moran, Mr. James male 0.0 0 0 330877 8.4583 Q 3
7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S 1
8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 S 2
9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 S 3
10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 C 2
11 1 3 Sandstrom, Miss. Marguerite Rut female 4.0 1 1 PP 9549 16.7000 G6 S 2
12 1 1 Bonnell, Miss. Elizabeth female 58.0 0 0 113783 26.5500 C103 S 2
13 0 3 Saundercock, Mr. William Henry male 20.0 0 0 A/5. 2151 8.0500 S 3
14 0 3 Andersson, Mr. Anders Johan male 39.0 1 5 347082 31.2750 S 2
15 0 3 Vestrom, Miss. Hulda Amanda Adolfina female 14.0 0 0 350406 7.8542 S 3
16 1 2 Hewlett, Mrs. (Mary D Kingcome) female 55.0 0 0 248706 16.0000 S 2
17 0 3 Rice, Master. Eugene male 2.0 4 1 382652 29.1250 Q 2
18 1 2 Williams, Mr. Charles Eugene male 0.0 0 0 244373 13.0000 S 3
19 0 3 Vander Planke, Mrs. Julius (Emelia Maria Vande... female 31.0 1 0 345763 18.0000 S 2
20 1 3 Masselmani, Mrs. Fatima female 0.0 0 0 2649 7.2250 C 3
21 0 2 Fynney, Mr. Joseph J male 35.0 0 0 239865 26.0000 S 2
22 1 2 Beesley, Mr. Lawrence male 34.0 0 0 248698 13.0000 D56 S 3
23 1 3 McGowan, Miss. Anna "Annie" female 15.0 0 0 330923 8.0292 Q 3
24 1 1 Sloper, Mr. William Thompson male 28.0 0 0 113788 35.5000 A6 S 2
25 0 3 Palsson, Miss. Torborg Danira female 8.0 3 1 349909 21.0750 S 2
26 1 3 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... female 38.0 1 5 347077 31.3875 S 2
27 0 3 Emir, Mr. Farred Chehab male 0.0 0 0 2631 7.2250 C 3
28 0 1 Fortune, Mr. Charles Alexander male 19.0 3 2 19950 263.0000 C23 C25 C27 S 1
29 1 3 O'Dwyer, Miss. Ellen "Nellie" female 0.0 0 0 330959 7.8792 Q 3
30 0 3 Todoroff, Mr. Lalio male 0.0 0 0 349216 7.8958 S 3
... ... ... ... ... ... ... ... ... ... ... ... ...
862 0 2 Giles, Mr. Frederick Edward male 21.0 1 0 28134 11.5000 S 3
863 1 1 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48.0 0 0 17466 25.9292 D17 S 2
864 0 3 Sage, Miss. Dorothy Edith "Dolly" female 0.0 8 2 CA. 2343 69.5500 S 1
865 0 2 Gill, Mr. John William male 24.0 0 0 233866 13.0000 S 3
866 1 2 Bystrom, Mrs. (Karolina) female 42.0 0 0 236852 13.0000 S 3
867 1 2 Duran y More, Miss. Asuncion female 27.0 1 0 SC/PARIS 2149 13.8583 C 3
868 0 1 Roebling, Mr. Washington Augustus II male 31.0 0 0 PC 17590 50.4958 A24 S 1
869 0 3 van Melkebeke, Mr. Philemon male 0.0 0 0 345777 9.5000 S 3
870 1 3 Johnson, Master. Harold Theodor male 4.0 1 1 347742 11.1333 S 3
871 0 3 Balkic, Mr. Cerin male 26.0 0 0 349248 7.8958 S 3
872 1 1 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 47.0 1 1 11751 52.5542 D35 S 1
873 0 1 Carlsson, Mr. Frans Olof male 33.0 0 0 695 5.0000 B51 B53 B55 S 3
874 0 3 Vander Cruyssen, Mr. Victor male 47.0 0 0 345765 9.0000 S 3
875 1 2 Abelson, Mrs. Samuel (Hannah Wizosky) female 28.0 1 0 P/PP 3381 24.0000 C 2
876 1 3 Najib, Miss. Adele Kiamie "Jane" female 15.0 0 0 2667 7.2250 C 3
877 0 3 Gustafsson, Mr. Alfred Ossian male 20.0 0 0 7534 9.8458 S 3
878 0 3 Petroff, Mr. Nedelio male 19.0 0 0 349212 7.8958 S 3
879 0 3 Laleff, Mr. Kristo male 0.0 0 0 349217 7.8958 S 3
880 1 1 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56.0 0 1 11767 83.1583 C50 C 1
881 1 2 Shelley, Mrs. William (Imanita Parrish Hall) female 25.0 0 1 230433 26.0000 S 2
882 0 3 Markun, Mr. Johann male 33.0 0 0 349257 7.8958 S 3
883 0 3 Dahlberg, Miss. Gerda Ulrika female 22.0 0 0 7552 10.5167 S 3
884 0 2 Banfield, Mr. Frederick James male 28.0 0 0 C.A./SOTON 34068 10.5000 S 3
885 0 3 Sutehall, Mr. Henry Jr male 25.0 0 0 SOTON/OQ 392076 7.0500 S 3
886 0 3 Rice, Mrs. William (Margaret Norton) female 39.0 0 5 382652 29.1250 Q 2
887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 S 3
888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S 2
889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 0.0 1 2 W./C. 6607 23.4500 S 2
890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C 2
891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 Q 3

891 rows × 12 columns


BarGraphs to look at Data

%matplotlib inline
import pprint
import pandas as pd
ship=pd.DataFrame.from_csv('train.csv')
#Remove all NaN
ship['Age']=    ship['Age'].where(ship['Age'].notnull(),0)
ship['Cabin']=  ship['Cabin'].where(ship['Cabin'].notnull(),"")
ship['Embarked']=  ship['Embarked'].where(ship['Embarked'].notnull(),"")
#Define some functions to help me look at the data
#Round fares to £2 "chunks"
def chunk_fare(x):
    x=int(x/2)*2
    return x

def chunk_fare_v2(x):
    if x<14:
        return 3
    elif x<50:
        return 2
    else:
        return 1

def age_function(x):
    if x<14:
        return 1
    elif x<25:
        return 2
    elif x<45:
        return 3
    else:
        return 4
#Function to look at 1 Column 
def View_Col(col_data,function_to_call,title):
    """
    Expects a Column of data
    """
    new_col_data=col_data.map(function_to_call)
    new_col_data.sort_values(inplace=True)
    rf=pd.DataFrame(new_col_data)
    rf.columns=['Data']
    col_counts=rf.groupby(by='Data').size()
    unique_cols=rf['Data'].unique()
    plt.title=title
    plt.bar(unique_cols,col_counts)


c=View_Col(ship['Age'],age_function,"Data Grouped by Age Function")

png

c=View_Col(ship['Fare'],chunk_fare,"Data Grouped by chuck_fare")

png

c=View_Col(ship['Fare'],chunk_fare_v2,"Data Grouped by chuck_fare")

png


Selecting data by columns

#Show only people with title's 
titanic_df[titanic_df.title_code==1][titanic_df.survived==0]