Lecture 7 - Functions, Pandas, Data Cleaning, and Preparation¶
In this lecture we will talk more about:
Functions in Python
Pandas Dataframes
Functions in Python¶
def doubleNum(nb):
    """Return twice the given number."""
    return nb * 2
number = 3
print("The double is", doubleNum(number))
The double is 6
number = 4
print("The double is", (lambda x: x*2)(number))
The double is 8
Pandas¶
This tutorial has been adapted from material by Wes McKinney, the creator of pandas.
import numpy as np
import pandas as pd
# Remember to always import pandas
Data loading¶
Last time, and in Lab 3, we talked about loading data in Python using pandas.
Let’s talk now about how to clean and prepare data using Pandas.
Handling Missing Data¶
string_data = pd.DataFrame(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
string_data.isnull()
0 | |
---|---|
0 | False |
1 | False |
2 | True |
3 | False |
string_data[0] = None
string_data
string_data.isnull()
0 | |
---|---|
0 | True |
1 | True |
2 | True |
3 | True |
Filtering Out Missing Data¶
from numpy import nan as NA
data = pd.DataFrame([1, NA, 3.5, NA, 7])
data
0 | |
---|---|
0 | 1.0 |
1 | NaN |
2 | 3.5 |
3 | NaN |
4 | 7.0 |
data.dropna()
0 | |
---|---|
0 | 1.0 |
2 | 3.5 |
4 | 7.0 |
data[data.notnull()]
0 | |
---|---|
0 | 1.0 |
1 | NaN |
2 | 3.5 |
3 | NaN |
4 | 7.0 |
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
[NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()
data
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 6.5 | 3.0 |
1 | 1.0 | NaN | NaN |
2 | NaN | NaN | NaN |
3 | NaN | 6.5 | 3.0 |
cleaned
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 6.5 | 3.0 |
data.dropna(how='all')
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 6.5 | 3.0 |
1 | 1.0 | NaN | NaN |
3 | NaN | 6.5 | 3.0 |
data[4] = NA
data
0 | 1 | 2 | 4 | |
---|---|---|---|---|
0 | 1.0 | 6.5 | 3.0 | NaN |
1 | 1.0 | NaN | NaN | NaN |
2 | NaN | NaN | NaN | NaN |
3 | NaN | 6.5 | 3.0 | NaN |
data.dropna(axis=1, how='all')
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 6.5 | 3.0 |
1 | 1.0 | NaN | NaN |
2 | NaN | NaN | NaN |
3 | NaN | 6.5 | 3.0 |
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
0 | 1 | 2 | |
---|---|---|---|
0 | -0.147671 | NaN | NaN |
1 | 0.808575 | NaN | NaN |
2 | -0.417914 | NaN | 1.339269 |
3 | -1.114100 | NaN | -1.934764 |
4 | -0.369869 | -0.100167 | 0.338057 |
5 | 1.358480 | -1.395844 | -2.302565 |
6 | 0.537828 | 0.146920 | -0.485762 |
df.dropna()
0 | 1 | 2 | |
---|---|---|---|
4 | -0.369869 | -0.100167 | 0.338057 |
5 | 1.358480 | -1.395844 | -2.302565 |
6 | 0.537828 | 0.146920 | -0.485762 |
#Keep only the rows with at least 2 non-NA values.
df.dropna(thresh=2)
0 | 1 | 2 | |
---|---|---|---|
2 | -0.417914 | NaN | 1.339269 |
3 | -1.114100 | NaN | -1.934764 |
4 | -0.369869 | -0.100167 | 0.338057 |
5 | 1.358480 | -1.395844 | -2.302565 |
6 | 0.537828 | 0.146920 | -0.485762 |
Filling In Missing Data¶
df.fillna(0)
0 | 1 | 2 | |
---|---|---|---|
0 | -0.147671 | 0.000000 | 0.000000 |
1 | 0.808575 | 0.000000 | 0.000000 |
2 | -0.417914 | 0.000000 | 1.339269 |
3 | -1.114100 | 0.000000 | -1.934764 |
4 | -0.369869 | -0.100167 | 0.338057 |
5 | 1.358480 | -1.395844 | -2.302565 |
6 | 0.537828 | 0.146920 | -0.485762 |
df.fillna({1: 0.5, 2: 0})
0 | 1 | 2 | |
---|---|---|---|
0 | -0.147671 | 0.500000 | 0.000000 |
1 | 0.808575 | 0.500000 | 0.000000 |
2 | -0.417914 | 0.500000 | 1.339269 |
3 | -1.114100 | 0.500000 | -1.934764 |
4 | -0.369869 | -0.100167 | 0.338057 |
5 | 1.358480 | -1.395844 | -2.302565 |
6 | 0.537828 | 0.146920 | -0.485762 |
df.fillna(0, inplace=True)
df
0 | 1 | 2 | |
---|---|---|---|
0 | -0.147671 | 0.000000 | 0.000000 |
1 | 0.808575 | 0.000000 | 0.000000 |
2 | -0.417914 | 0.000000 | 1.339269 |
3 | -1.114100 | 0.000000 | -1.934764 |
4 | -0.369869 | -0.100167 | 0.338057 |
5 | 1.358480 | -1.395844 | -2.302565 |
6 | 0.537828 | 0.146920 | -0.485762 |
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df
#df.fillna(method='ffill') # forward fill with last valid observation (bfill to fill backward)
df.fillna(method='ffill', limit=2) # limit: maximum number of consecutive NaN values to forward/backward fill
0 | 1 | 2 | |
---|---|---|---|
0 | 0.331259 | -0.619757 | 1.100359 |
1 | 0.469276 | -0.696190 | -0.816206 |
2 | 0.617573 | -0.696190 | 0.235891 |
3 | 3.014322 | -0.696190 | -0.004664 |
4 | 1.199192 | NaN | -0.004664 |
5 | -0.835414 | NaN | -0.004664 |
data = pd.DataFrame([1., NA, 3.5, NA, 7])
data.fillna(data.mean()) #fills NA with data mean value
0 | |
---|---|
0 | 1.000000 |
1 | 3.833333 |
2 | 3.500000 |
3 | 3.833333 |
4 | 7.000000 |
Data Transformation¶
Removing Duplicates¶
# 'k1' cycles ['one', 'two'] three times plus a trailing 'two', so the last
# row ('two', 4) exactly duplicates the one before it — used below to
# demonstrate duplicated() and drop_duplicates().
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data
k1 | k2 | |
---|---|---|
0 | one | 1 |
1 | two | 1 |
2 | one | 2 |
3 | two | 3 |
4 | one | 3 |
5 | two | 4 |
6 | two | 4 |
data.duplicated()
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
data.drop_duplicates()
k1 | k2 | |
---|---|---|
0 | one | 1 |
1 | two | 1 |
2 | one | 2 |
3 | two | 3 |
4 | one | 3 |
5 | two | 4 |
data['v1'] = range(7)
data
data.drop_duplicates(['k1'])
k1 | k2 | v1 | |
---|---|---|---|
0 | one | 1 | 0 |
1 | two | 1 | 1 |
data
data.drop_duplicates(['k1', 'k2'], keep='last')
k1 | k2 | v1 | |
---|---|---|---|
0 | one | 1 | 0 |
1 | two | 1 | 1 |
2 | one | 2 | 2 |
3 | two | 3 | 3 |
4 | one | 3 | 4 |
6 | two | 4 | 6 |
Transforming Data Using a Function or Mapping¶
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
'Pastrami', 'corned beef', 'Bacon',
'pastrami', 'honey ham', 'nova lox'],
'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
food | ounces | |
---|---|---|
0 | bacon | 4.0 |
1 | pulled pork | 3.0 |
2 | bacon | 12.0 |
3 | Pastrami | 6.0 |
4 | corned beef | 7.5 |
5 | Bacon | 8.0 |
6 | pastrami | 3.0 |
7 | honey ham | 5.0 |
8 | nova lox | 6.0 |
# Lookup table mapping each (lowercased) food name to its source animal.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon'
}
# Normalize case first so 'Bacon' and 'Pastrami' match the dict keys;
# Series.map returns NaN for any food missing from the table.
lowercased = data['food'].str.lower()
lowercased
data['animal'] = lowercased.map(meat_to_animal)
data
food | ounces | animal | |
---|---|---|---|
0 | bacon | 4.0 | pig |
1 | pulled pork | 3.0 | pig |
2 | bacon | 12.0 | pig |
3 | Pastrami | 6.0 | cow |
4 | corned beef | 7.5 | cow |
5 | Bacon | 8.0 | pig |
6 | pastrami | 3.0 | cow |
7 | honey ham | 5.0 | pig |
8 | nova lox | 6.0 | salmon |
data['food'].str.lower().map(lambda x: meat_to_animal[x])
0 pig
1 pig
2 pig
3 cow
4 cow
5 pig
6 cow
7 pig
8 salmon
Name: food, dtype: object
Replacing Values¶
data = pd.DataFrame([1., -999., 2., -999., -1000., 3.])
data
0 | |
---|---|
0 | 1.0 |
1 | -999.0 |
2 | 2.0 |
3 | -999.0 |
4 | -1000.0 |
5 | 3.0 |
data.replace(-999, np.nan)
0 | |
---|---|
0 | 1.0 |
1 | NaN |
2 | 2.0 |
3 | NaN |
4 | -1000.0 |
5 | 3.0 |
data.replace([-999, -1000], np.nan)
0 | |
---|---|
0 | 1.0 |
1 | NaN |
2 | 2.0 |
3 | NaN |
4 | NaN |
5 | 3.0 |
data.replace([-999, -1000], [np.nan, 0])
0 | |
---|---|
0 | 1.0 |
1 | NaN |
2 | 2.0 |
3 | NaN |
4 | 0.0 |
5 | 3.0 |
data=data.replace({-999: np.nan, -1000: 0})
data.replace(np.nan, 0)
0 | |
---|---|
0 | 1.0 |
1 | 0.0 |
2 | 2.0 |
3 | 0.0 |
4 | 0.0 |
5 | 3.0 |
Renaming Axis Indexes¶
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
index=['Ohio', 'Colorado', 'New York'],
columns=['one', 'two', 'three', 'four'])
data
one | two | three | four | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colorado | 4 | 5 | 6 | 7 |
New York | 8 | 9 | 10 | 11 |
transform = lambda x: x[:4].upper()
data.index.map(transform)
Index(['OHIO', 'COLO', 'NEW '], dtype='object')
data.index = data.index.map(transform)
data
one | two | three | four | |
---|---|---|---|---|
OHIO | 0 | 1 | 2 | 3 |
COLO | 4 | 5 | 6 | 7 |
NEW | 8 | 9 | 10 | 11 |
data.rename(index=str.title, columns=str.upper)
ONE | TWO | THREE | FOUR | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colo | 4 | 5 | 6 | 7 |
New | 8 | 9 | 10 | 11 |
data.rename(index={'OHIO': 'INDIANA'},
columns={'three': 'peekaboo'})
one | two | peekaboo | four | |
---|---|---|---|---|
INDIANA | 0 | 1 | 2 | 3 |
COLO | 4 | 5 | 6 | 7 |
NEW | 8 | 9 | 10 | 11 |
data
one | two | three | four | |
---|---|---|---|---|
OHIO | 0 | 1 | 2 | 3 |
COLO | 4 | 5 | 6 | 7 |
NEW | 8 | 9 | 10 | 11 |
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data
one | two | three | four | |
---|---|---|---|---|
INDIANA | 0 | 1 | 2 | 3 |
COLO | 4 | 5 | 6 | 7 |
NEW | 8 | 9 | 10 | 11 |
Discretization and Binning¶
ages = [18, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats
[NaN, (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats.codes
cats.categories
pd.value_counts(cats)
(18, 25] 4
(25, 35] 3
(35, 60] 3
(60, 100] 1
dtype: int64
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
[NaN, 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']
data = np.random.rand(20)
data
pd.cut(data, 4, precision=2)
[(0.05, 0.27], (0.05, 0.27], (0.27, 0.49], (0.27, 0.49], (0.05, 0.27], ..., (0.49, 0.71], (0.05, 0.27], (0.71, 0.93], (0.27, 0.49], (0.27, 0.49]]
Length: 20
Categories (4, interval[float64]): [(0.05, 0.27] < (0.27, 0.49] < (0.49, 0.71] < (0.71, 0.93]]
data = np.random.randn(1000) # Normally distributed
cats = pd.qcut(data, 10) # Cut into deciles (10 equal-sized groups)
cats
pd.value_counts(cats)
(-3.318, -1.219] 100
(-1.219, -0.777] 100
(-0.777, -0.503] 100
(-0.503, -0.227] 100
(-0.227, 0.0661] 100
(0.0661, 0.315] 100
(0.315, 0.615] 100
(0.615, 0.977] 100
(0.977, 1.348] 100
(1.348, 3.401] 100
dtype: int64
cats = pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
pd.value_counts(cats)
(-1.219, 0.0661] 400
(0.0661, 1.348] 400
(-3.318, -1.219] 100
(1.348, 3.401] 100
dtype: int64
Detecting and Filtering Outliers¶
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | 0.055077 | 0.045030 | -0.025765 | -0.000986 |
std | 0.993835 | 0.978408 | 0.978154 | 1.031257 |
min | -3.832766 | -3.076625 | -3.213987 | -3.419100 |
25% | -0.583017 | -0.613001 | -0.719985 | -0.683580 |
50% | 0.055401 | 0.039207 | -0.010730 | -0.012271 |
75% | 0.707915 | 0.764889 | 0.677561 | 0.673964 |
max | 3.739664 | 3.000617 | 2.741589 | 3.223334 |
col = data[2]
col[np.abs(col) > 3]
420 -3.213987
485 -3.023981
Name: 2, dtype: float64
data[(np.abs(data) > 3).any(1)]
0 | 1 | 2 | 3 | |
---|---|---|---|---|
111 | -1.237494 | 0.385405 | 1.708786 | 3.223334 |
186 | 0.129223 | -0.257331 | -1.856794 | -3.419100 |
308 | -0.324471 | 3.000617 | -1.237963 | -0.085539 |
356 | 3.192386 | 0.567717 | 0.174435 | 0.320250 |
420 | 0.679497 | -2.224678 | -3.213987 | -0.504708 |
425 | -3.832766 | -1.893903 | 0.340340 | -0.327178 |
473 | 0.363863 | -3.076625 | -0.344005 | -0.413175 |
485 | 0.124100 | 1.183545 | -3.023981 | 0.579325 |
541 | 3.190462 | -0.028757 | -0.047572 | 2.121152 |
753 | 3.739664 | 0.733766 | -0.229820 | 0.368722 |
823 | 1.983805 | -0.933073 | -1.768958 | -3.194717 |
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | 0.054787 | 0.045106 | -0.025527 | -0.000596 |
std | 0.987260 | 0.978164 | 0.977406 | 1.028689 |
min | -3.000000 | -3.000000 | -3.000000 | -3.000000 |
25% | -0.583017 | -0.613001 | -0.719985 | -0.683580 |
50% | 0.055401 | 0.039207 | -0.010730 | -0.012271 |
75% | 0.707915 | 0.764889 | 0.677561 | 0.673964 |
max | 3.000000 | 3.000000 | 2.741589 | 3.000000 |
data.head()
np.sign(data).head()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 1.0 | 1.0 | -1.0 | -1.0 |
1 | 1.0 | 1.0 | 1.0 | -1.0 |
2 | 1.0 | -1.0 | -1.0 | -1.0 |
3 | 1.0 | -1.0 | 1.0 | -1.0 |
4 | -1.0 | -1.0 | 1.0 | -1.0 |
Permutation and Random Sampling¶
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
df
#sampler = np.random.permutation(5)
#sampler
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
2 | 8 | 9 | 10 | 11 |
3 | 12 | 13 | 14 | 15 |
4 | 16 | 17 | 18 | 19 |
df
df.take(sampler)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-58-2cab25988b0f> in <module>
1 df
----> 2 df.take(sampler)
NameError: name 'sampler' is not defined
df.sample(n=3) # selects 3 random rows, without replacement by default
0 | 1 | 2 | 3 | |
---|---|---|---|---|
3 | 12 | 13 | 14 | 15 |
4 | 16 | 17 | 18 | 19 |
2 | 8 | 9 | 10 | 11 |
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws
3 6
0 5
0 5
3 6
2 -1
3 6
0 5
0 5
4 4
4 4
dtype: int64
Computing Indicator/Dummy Variables¶
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
'data1': range(6)})
df
pd.get_dummies(df['key'])
a | b | c | |
---|---|---|---|
0 | 0 | 1 | 0 |
1 | 0 | 1 | 0 |
2 | 1 | 0 | 0 |
3 | 0 | 0 | 1 |
4 | 1 | 0 | 0 |
5 | 0 | 1 | 0 |
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
data1 | key_a | key_b | key_c | |
---|---|---|---|---|
0 | 0 | 0 | 1 | 0 |
1 | 1 | 0 | 1 | 0 |
2 | 2 | 1 | 0 | 0 |
3 | 3 | 0 | 0 | 1 |
4 | 4 | 1 | 0 | 0 |
5 | 5 | 0 | 1 | 0 |
String Manipulation¶
String Object Methods¶
val = 'a,b, guido'
val.split(',')
['a', 'b', ' guido']
pieces = [x.strip() for x in val.split(',')]
pieces
['a', 'b', 'guido']
first, second, third = pieces
third
first + '::' + second + '::' + third
'a::b::guido'
'::'.join(pieces)
'a::b::guido'
'guido' in val
val.index(',')
val.find(':')
-1
#val.index(':')
val.count('a')
1
val
#val.replace(',', '::')
val.replace(',', '')
'ab guido'
Regular Expressions¶
import re
text = "foo bar\t baz \tqux"
re.split('\s+', text) #All spaces (space, tab, newline...)
['foo', 'bar', 'baz', 'qux']
regex = re.compile('\s+')
regex.split(text)
['foo', 'bar', 'baz', 'qux']
regex.findall(text)
[' ', '\t ', ' \t']
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}' #r for raw string
# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)
regex.findall(text)
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']
m = regex.search(text)
text[m.start():m.end()]
'dave@google.com'
print(regex.match(text))
None
print(regex.sub('REDACTED', text))
Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups()
('wesm', 'bright', 'net')
regex.findall(text)
[('dave', 'google', 'com'),
('steve', 'gmail', 'com'),
('rob', 'gmail', 'com'),
('ryan', 'yahoo', 'com')]
print(regex.sub(r'\nUsername: \1, Domain: \2, Suffix: \3', text))
Dave
Username: dave, Domain: google, Suffix: com
Steve
Username: steve, Domain: gmail, Suffix: com
Rob
Username: rob, Domain: gmail, Suffix: com
Ryan
Username: ryan, Domain: yahoo, Suffix: com
Vectorized String Functions in pandas¶
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data
data.isnull()
Dave False
Steve False
Rob False
Wes True
dtype: bool
data.str.contains('gmail')
Dave False
Steve True
Rob True
Wes NaN
dtype: object
pattern
data.str.findall(pattern, flags=re.IGNORECASE)
Dave [(dave, google, com)]
Steve [(steve, gmail, com)]
Rob [(rob, gmail, com)]
Wes NaN
dtype: object
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches
Dave True
Steve True
Rob True
Wes NaN
dtype: object
data.str.get(1)
data.str[1]
Dave a
Steve t
Rob o
Wes NaN
dtype: object
data.str[:5]
Dave dave@
Steve steve
Rob rob@g
Wes NaN
dtype: object