This website works best on desktop in both themes; on mobile devices, please switch to the light theme.

Pandas Exploration

Pandas Exploration#

[1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
[2]:
# Load the iris dataset with scikit-learn's load_iris; the next cell reads its
# feature_names, data and target attributes.
iris = load_iris()
[3]:
# Turn each feature name into a compact column label by removing the spaces
# and the "(cm)" unit marker.
featureColumns = []
for feature_name in iris.feature_names:
    featureColumns.append(feature_name.replace(" ", "").replace("(cm)", ""))

# One row per sample: the measurements first, then the class label as 'target'.
df = pd.DataFrame(data=iris.data, columns=featureColumns)
df['target'] = iris.target

df.head()
[3]:
sepallength sepalwidth petallength petalwidth target
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
[4]:
# Single filter: keep only the rows whose sepal length is below 5.
df.loc[df['sepallength'].lt(5)].head()
[4]:
sepallength sepalwidth petallength petalwidth target
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
6 4.6 3.4 1.4 0.3 0
8 4.4 2.9 1.4 0.2 0
[5]:
# Two filters combined with a logical AND: sepal length below 5
# and a target label that is either 0 or 1.
df.loc[df['sepallength'].lt(5) & df['target'].isin([0, 1])].head()

[5]:
sepallength sepalwidth petallength petalwidth target
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
6 4.6 3.4 1.4 0.3 0
8 4.4 2.9 1.4 0.2 0

Transforming Data#

[6]:
# Seed NumPy's global generator so the synthetic readings below are identical
# on every Restart & Run All (same seeding style as the student-data cell).
np.random.seed(100)

# Eleven synthetic observations: two noisy numeric readings plus a one-letter
# location code that later cells expand and group on.
df = pd.DataFrame({
    'temperature': 23 + 10 * np.random.randn(11),
    'thunderstorm': 150 + 10 * np.random.randn(11),
    'location': list('XXYYXXYYXXY'),
})

df.head()
[6]:
temperature thunderstorm location
0 37.178703 151.250130 X
1 16.338412 148.930679 X
2 16.614139 151.664537 Y
3 35.818557 154.044738 Y
4 21.598561 143.369174 X
[7]:
# Per-column replacement table: in 'location', "X" becomes "MISSISSIPPI"
# and "Y" becomes "MANALI".
replaceValues = dict(location=dict(X="MISSISSIPPI", Y="MANALI"))

# regex=True treats the keys as patterns rather than whole-cell values.
df = df.replace(to_replace=replaceValues, regex=True)
df.head()
[7]:
temperature thunderstorm location
0 37.178703 151.250130 MISSISSIPPI
1 16.338412 148.930679 MISSISSIPPI
2 16.614139 151.664537 MANALI
3 35.818557 154.044738 MANALI
4 21.598561 143.369174 MISSISSIPPI
[8]:
# Keep the rows whose location text contains the substring "ISSI".
df[df['location'].str.contains("ISSI")]
[8]:
temperature thunderstorm location
0 37.178703 151.250130 MISSISSIPPI
1 16.338412 148.930679 MISSISSIPPI
4 21.598561 143.369174 MISSISSIPPI
5 29.470110 141.694351 MISSISSIPPI
8 35.765885 144.513669 MISSISSIPPI
9 27.894740 156.470016 MISSISSIPPI
[9]:
# Average of every remaining column within each location.
df.groupby(by='location').mean()
[9]:
temperature thunderstorm
location
MANALI 29.238917 155.537351
MISSISSIPPI 28.041068 147.704670
[10]:
# pandas (pd) and numpy (np) are imported in the first cell of the notebook,
# so they are not re-imported here.

# --- Group A: five students with hand-entered measurements -----------------
serIndex = ['s1', 's2', 's3', 's4', 's5']

heights_A = pd.Series([176.2, 158.4, 167.6, 156.2, 161.4], index=serIndex)
weights_A = pd.Series([85.1, 90.2, 76.8, 80.4, 78.9], index=serIndex)

df_A = pd.DataFrame({
    'Student_height': heights_A,
    'Student_weight': weights_A,
    'Gender': ['M', 'F', 'M', 'M', 'F'],
})

# --- Add one more student (s6) as a labelled row ----------------------------
# Built from a plain list: wrapping the mixed values in np.array would coerce
# the two numbers to strings.
s = pd.Series([165.4, 82.7, 'F'], index=['Student_height', 'Student_weight', 'Gender'])
s.name = 's6'

# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame is
# the replacement. The mixed-type row makes the numeric columns object dtype
# after the concat, so they are cast back to float explicitly.
df_AA = pd.concat([df_A, s.to_frame().T]).astype(
    {'Student_height': float, 'Student_weight': float}
)

# --- Group B: five students with simulated measurements ---------------------
np.random.seed(100)
heights_B = pd.Series(np.random.normal(loc=170.0, scale=25, size=5))

# Re-seeding with the same value means the weights are drawn from the same
# underlying random sequence as the heights, just with a different loc/scale.
np.random.seed(100)
weights_B = pd.Series(np.random.normal(loc=75.0, scale=12.0, size=5))

df_B = pd.DataFrame({
    'Student_height': heights_B,
    'Student_weight': weights_B,
    'Gender': ['F', 'M', 'F', 'F', 'M'],
})
df_B.index = ['s7', 's8', 's9', 's10', 's11']

# --- All eleven students in one frame ----------------------------------------
pd.concat([df_AA, df_B])
[10]:
Student_height Student_weight Gender
s1 176.2 85.1 M
s2 158.4 90.2 F
s3 167.6 76.8 M
s4 156.2 80.4 M
s5 161.4 78.9 F
s6 165.4 82.7 F
s7 126.256 54.0028 F
s8 178.567 79.1122 M
s9 198.826 88.8364 F
s10 163.689 71.9708 F
s11 194.533 86.7758 M
[13]:
# Four scores labelled 'a' through 'd'.
s = pd.Series(
    data=[89.2, 76.4, 98.2, 75.9],
    index=['a', 'b', 'c', 'd'],
)

# `in` on a Series tests the index labels, not the stored values.
'b' in s
[13]:
True
[ ]: