# Pandas Exploration
[1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
[2]:
# Load the iris dataset bundled with scikit-learn; later cells read its
# `data`, `feature_names` and `target` attributes.
iris = load_iris()
[3]:
# Turn the raw feature names into compact column labels by removing the
# spaces first and then the "(cm)" unit suffix.
featureColumns = []
for raw_name in iris.feature_names:
    featureColumns.append(raw_name.replace(" ", "").replace("(cm)", ""))

# One row per sample: the measurements plus the class label as `target`.
df = pd.DataFrame(data=iris.data, columns=featureColumns).assign(target=iris.target)
df.head()
[3]:
sepallength | sepalwidth | petallength | petalwidth | target | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | 0 |
1 | 4.9 | 3.0 | 1.4 | 0.2 | 0 |
2 | 4.7 | 3.2 | 1.3 | 0.2 | 0 |
3 | 4.6 | 3.1 | 1.5 | 0.2 | 0 |
4 | 5.0 | 3.6 | 1.4 | 0.2 | 0 |
[4]:
# Single filter: keep only the rows whose sepal length is below 5.
short_sepal = df['sepallength'].lt(5)
df.loc[short_sepal].head()
[4]:
sepallength | sepalwidth | petallength | petalwidth | target | |
---|---|---|---|---|---|
1 | 4.9 | 3.0 | 1.4 | 0.2 | 0 |
2 | 4.7 | 3.2 | 1.3 | 0.2 | 0 |
3 | 4.6 | 3.1 | 1.5 | 0.2 | 0 |
6 | 4.6 | 3.4 | 1.4 | 0.3 | 0 |
8 | 4.4 | 2.9 | 1.4 | 0.2 | 0 |
[5]:
# Two filters combined with `&`: sepal length below 5 AND target in {0, 1}.
is_short = df['sepallength'] < 5
is_wanted_target = df['target'].isin([0, 1])
df[is_short & is_wanted_target].head()
[5]:
sepallength | sepalwidth | petallength | petalwidth | target | |
---|---|---|---|---|---|
1 | 4.9 | 3.0 | 1.4 | 0.2 | 0 |
2 | 4.7 | 3.2 | 1.3 | 0.2 | 0 |
3 | 4.6 | 3.1 | 1.5 | 0.2 | 0 |
6 | 4.6 | 3.4 | 1.4 | 0.3 | 0 |
8 | 4.4 | 2.9 | 1.4 | 0.2 | 0 |
## Transforming Data
[6]:
# Synthetic weather readings for 11 observations at two coded locations.
# Seed the generator first so the numbers below are the same on every
# Restart & Run All (the original draws were unseeded).
np.random.seed(42)
df = pd.DataFrame({
    # Normally distributed around 23 with a spread of 10.
    'temperature': 23 + 10 * np.random.randn(11),
    # Normally distributed around 150 with a spread of 10.
    'thunderstorm': 150 + 10 * np.random.randn(11),
    # One location code ("X" or "Y") per observation.
    'location': list('XXYYXXYYXXY'),
})
df.head()
[6]:
temperature | thunderstorm | location | |
---|---|---|---|
0 | 37.178703 | 151.250130 | X |
1 | 16.338412 | 148.930679 | X |
2 | 16.614139 | 151.664537 | Y |
3 | 35.818557 | 154.044738 | Y |
4 | 21.598561 | 143.369174 | X |
[7]:
# Recode the single-letter location codes into full place names.
# Outer key = column to look in; inner dict = old value -> new value.
replaceValues = {
    'location': {
        "X": "MISSISSIPPI",
        "Y": "MANALI",
    }
}
# Exact-value replacement on purpose: with regex=True the keys would be
# treated as regular-expression patterns and match anywhere inside a cell,
# which is not what a whole-cell code lookup should do.
df = df.replace(replaceValues)
df.head()
[7]:
temperature | thunderstorm | location | |
---|---|---|---|
0 | 37.178703 | 151.250130 | MISSISSIPPI |
1 | 16.338412 | 148.930679 | MISSISSIPPI |
2 | 16.614139 | 151.664537 | MANALI |
3 | 35.818557 | 154.044738 | MANALI |
4 | 21.598561 | 143.369174 | MISSISSIPPI |
[8]:
# Boolean mask of rows whose location contains the substring "ISSI",
# then use it to select those rows.
has_issi = df['location'].str.contains("ISSI")
df[has_issi]
[8]:
temperature | thunderstorm | location | |
---|---|---|---|
0 | 37.178703 | 151.250130 | MISSISSIPPI |
1 | 16.338412 | 148.930679 | MISSISSIPPI |
4 | 21.598561 | 143.369174 | MISSISSIPPI |
5 | 29.470110 | 141.694351 | MISSISSIPPI |
8 | 35.765885 | 144.513669 | MISSISSIPPI |
9 | 27.894740 | 156.470016 | MISSISSIPPI |
[9]:
# Mean of each remaining column within every location group.
by_location = df.groupby('location')
by_location.mean()
[9]:
temperature | thunderstorm | |
---|---|---|
location | ||
MANALI | 29.238917 | 155.537351 |
MISSISSIPPI | 28.041068 | 147.704670 |
[10]:
# pandas (pd) and numpy (np) are already imported in the notebook's first cell.

# Cohort A: five students with hand-entered measurements, labelled s1..s5.
serIndex = ['s1','s2','s3','s4','s5']
heights_A = pd.Series([176.2, 158.4, 167.6, 156.2, 161.4], index=serIndex)
weights_A = pd.Series([85.1, 90.2, 76.8, 80.4, 78.9], index=serIndex)
df_A = pd.DataFrame({
    'Student_height': heights_A,
    'Student_weight': weights_A,
    'Gender': ['M','F','M','M','F'],
})

# Sixth student, to be added as row 's6'. Built from a plain list so the
# numbers stay floats: wrapping mixed values in np.array() coerces every
# element to a string, which silently turns both measurement columns into
# object dtype once the row is added.
s = pd.Series(
    [165.4, 82.7, 'F'],
    index=['Student_height', 'Student_weight', 'Gender'],
    name='s6',
)
# DataFrame.append was removed in pandas 2.0, so concatenate a one-row frame.
# The transposed row is all-object; infer_objects() restores float dtype on
# its numeric cells so the combined columns stay numeric.
df_AA = pd.concat([df_A, s.to_frame().T.infer_objects()])

# Cohort B: five students with randomly generated measurements.
# The generator is re-seeded with the same value before each draw, so both
# columns are built from the same underlying standard-normal samples (only
# the loc/scale differ). Kept exactly as in the original cell.
np.random.seed(100)
heights_B = pd.Series(np.random.normal(loc=170.0, scale=25, size=5))
np.random.seed(100)
weights_B = pd.Series(np.random.normal(loc=75.0, scale=12.0, size=5))
df_B = pd.DataFrame({'Student_height': heights_B, 'Student_weight': weights_B})
df_B.index = ['s7','s8','s9','s10','s11']
df_B['Gender'] = ['F','M','F','F','M']

# Stack both cohorts into one table (s1..s11).
pd.concat([df_AA, df_B])
[10]:
Student_height | Student_weight | Gender | |
---|---|---|---|
s1 | 176.2 | 85.1 | M |
s2 | 158.4 | 90.2 | F |
s3 | 167.6 | 76.8 | M |
s4 | 156.2 | 80.4 | M |
s5 | 161.4 | 78.9 | F |
s6 | 165.4 | 82.7 | F |
s7 | 126.256 | 54.0028 | F |
s8 | 178.567 | 79.1122 | M |
s9 | 198.826 | 88.8364 | F |
s10 | 163.689 | 71.9708 | F |
s11 | 194.533 | 86.7758 | M |
[13]:
# Four scores labelled 'a'..'d'. The `in` operator on a Series tests the
# index labels (not the stored values), so this asks "is there a label 'b'?".
score_labels = ['a', 'b', 'c', 'd']
s = pd.Series(data=[89.2, 76.4, 98.2, 75.9], index=score_labels)
'b' in s
[13]:
True
[ ]: