Lecture 7 - Functions, Pandas, Data Cleaning, and Preparation¶
In this lecture we will talk more about:
Functions in Python
Writing Python programs and running them from the terminal
Pandas Dataframes
Pandas¶
This tutorial has been adapted from material by Wes McKinney, the creator of Pandas.
import numpy as np
import pandas as pd
# Remember to always import pandas
Data loading¶
Last time, and in Lab 3, we talked about loading data in Python using pandas.
Let’s talk now about how to clean and prepare data using Pandas.
Handling Missing Data¶
# One-column DataFrame of strings; np.nan marks the missing entry.
string_data = pd.DataFrame(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
# isnull() returns a same-shaped boolean frame: True where a value is missing.
string_data.isnull()
0 | |
---|---|
0 | False |
1 | False |
2 | True |
3 | False |
# Assigning None to column 0 replaces the *entire* column, so every entry
# becomes missing; pandas treats Python None as NA just like np.nan.
string_data[0] = None
string_data.isnull()
0 | |
---|---|
0 | True |
1 | True |
2 | True |
3 | True |
Filtering Out Missing Data¶
from numpy import nan as NA
data = pd.DataFrame([1, NA, 3.5, NA, 7])
# dropna() drops every row containing at least one missing value.
data.dropna()
0 | |
---|---|
0 | 1.0 |
2 | 3.5 |
4 | 7.0 |
data[data.notnull()]
0 | |
---|---|
0 | 1.0 |
1 | NaN |
2 | 3.5 |
3 | NaN |
4 | 7.0 |
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
[NA, NA, NA], [NA, 6.5, 3.]])
# Default dropna(): keep only rows with no missing values at all.
cleaned = data.dropna()
data
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 6.5 | 3.0 |
1 | 1.0 | NaN | NaN |
2 | NaN | NaN | NaN |
3 | NaN | 6.5 | 3.0 |
cleaned
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 6.5 | 3.0 |
data.dropna(how='all')
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 6.5 | 3.0 |
1 | 1.0 | NaN | NaN |
3 | NaN | 6.5 | 3.0 |
data[4] = NA
data
0 | 1 | 2 | 4 | |
---|---|---|---|---|
0 | 1.0 | 6.5 | 3.0 | NaN |
1 | 1.0 | NaN | NaN | NaN |
2 | NaN | NaN | NaN | NaN |
3 | NaN | 6.5 | 3.0 | NaN |
data.dropna(axis=1, how='all')
0 | 1 | 2 | |
---|---|---|---|
0 | 1.0 | 6.5 | 3.0 |
1 | 1.0 | NaN | NaN |
2 | NaN | NaN | NaN |
3 | NaN | 6.5 | 3.0 |
# 7x3 frame of standard-normal draws; plant NAs in the first rows of
# columns 1 and 2 to demonstrate row-wise dropping with thresh below.
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
0 | 1 | 2 | |
---|---|---|---|
0 | -1.695111 | NaN | NaN |
1 | 0.047417 | NaN | NaN |
2 | 0.479154 | NaN | -0.763134 |
3 | 1.023613 | NaN | 0.717773 |
4 | -0.213959 | -0.187290 | 0.171453 |
5 | -1.242170 | -0.003887 | -1.705038 |
6 | 1.025683 | -1.007867 | 0.084850 |
df.dropna()
0 | 1 | 2 | |
---|---|---|---|
4 | -0.213959 | -0.187290 | 0.171453 |
5 | -1.242170 | -0.003887 | -1.705038 |
6 | 1.025683 | -1.007867 | 0.084850 |
df.dropna(thresh=2)
0 | 1 | 2 | |
---|---|---|---|
2 | 0.479154 | NaN | -0.763134 |
3 | 1.023613 | NaN | 0.717773 |
4 | -0.213959 | -0.187290 | 0.171453 |
5 | -1.242170 | -0.003887 | -1.705038 |
6 | 1.025683 | -1.007867 | 0.084850 |
Filling In Missing Data¶
df.fillna(0)
0 | 1 | 2 | |
---|---|---|---|
0 | -1.695111 | 0.000000 | 0.000000 |
1 | 0.047417 | 0.000000 | 0.000000 |
2 | 0.479154 | 0.000000 | -0.763134 |
3 | 1.023613 | 0.000000 | 0.717773 |
4 | -0.213959 | -0.187290 | 0.171453 |
5 | -1.242170 | -0.003887 | -1.705038 |
6 | 1.025683 | -1.007867 | 0.084850 |
df.fillna({1: 0.5, 2: 0})
0 | 1 | 2 | |
---|---|---|---|
0 | -1.695111 | 0.500000 | 0.000000 |
1 | 0.047417 | 0.500000 | 0.000000 |
2 | 0.479154 | 0.500000 | -0.763134 |
3 | 1.023613 | 0.500000 | 0.717773 |
4 | -0.213959 | -0.187290 | 0.171453 |
5 | -1.242170 | -0.003887 | -1.705038 |
6 | 1.025683 | -1.007867 | 0.084850 |
_ = df.fillna(0, inplace=True)
df
0 | 1 | 2 | |
---|---|---|---|
0 | -1.695111 | 0.000000 | 0.000000 |
1 | 0.047417 | 0.000000 | 0.000000 |
2 | 0.479154 | 0.000000 | -0.763134 |
3 | 1.023613 | 0.000000 | 0.717773 |
4 | -0.213959 | -0.187290 | 0.171453 |
5 | -1.242170 | -0.003887 | -1.705038 |
6 | 1.025683 | -1.007867 | 0.084850 |
# 6x3 frame with NAs in the lower part of columns 1 and 2.
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df
# Forward-fill: propagate the last valid observation down each column.
# fillna(method='ffill') was deprecated in pandas 2.1 and removed in 3.0;
# DataFrame.ffill() is the supported spelling and behaves identically.
df.ffill()
# limit=2 fills at most two consecutive NAs below each valid value.
df.ffill(limit=2)
0 | 1 | 2 | |
---|---|---|---|
0 | -0.648247 | -0.485004 | -1.217676 |
1 | 1.215154 | 0.401557 | 1.171716 |
2 | 0.388848 | 0.401557 | -0.747839 |
3 | 2.503947 | 0.401557 | -2.352020 |
4 | 0.880440 | NaN | -2.352020 |
5 | 0.528105 | NaN | -2.352020 |
data = pd.DataFrame([1., NA, 3.5, NA, 7])
data.fillna(data.mean())
0 | |
---|---|
0 | 1.000000 |
1 | 3.833333 |
2 | 3.500000 |
3 | 3.833333 |
4 | 7.000000 |
Data Transformation¶
Removing Duplicates¶
# 'k1' alternates one/two; the final ('two', 4) row duplicates row 5,
# which is what duplicated()/drop_duplicates() detect below.
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
'k2': [1, 1, 2, 3, 3, 4, 4]})
data
k1 | k2 | |
---|---|---|
0 | one | 1 |
1 | two | 1 |
2 | one | 2 |
3 | two | 3 |
4 | one | 3 |
5 | two | 4 |
6 | two | 4 |
data.duplicated()
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
data.drop_duplicates()
k1 | k2 | |
---|---|---|
0 | one | 1 |
1 | two | 1 |
2 | one | 2 |
3 | two | 3 |
4 | one | 3 |
5 | two | 4 |
data['v1'] = range(7)
data.drop_duplicates(['k1'])
k1 | k2 | v1 | |
---|---|---|---|
0 | one | 1 | 0 |
1 | two | 1 | 1 |
data.drop_duplicates(['k1', 'k2'], keep='last')
k1 | k2 | v1 | |
---|---|---|---|
0 | one | 1 | 0 |
1 | two | 1 | 1 |
2 | one | 2 | 2 |
3 | two | 3 | 3 |
4 | one | 3 | 4 |
6 | two | 4 | 6 |
Transforming Data Using a Function or Mapping¶
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
'Pastrami', 'corned beef', 'Bacon',
'pastrami', 'honey ham', 'nova lox'],
'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
food | ounces | |
---|---|---|
0 | bacon | 4.0 |
1 | pulled pork | 3.0 |
2 | bacon | 12.0 |
3 | Pastrami | 6.0 |
4 | corned beef | 7.5 |
5 | Bacon | 8.0 |
6 | pastrami | 3.0 |
7 | honey ham | 5.0 |
8 | nova lox | 6.0 |
# Lookup table: each (lowercased) meat maps to its source animal.
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}
# Normalize case first: the 'food' column mixes e.g. 'bacon' and 'Bacon'.
lowercased = data['food'].str.lower()
lowercased
# Series.map looks each value up in the dict; unmatched values become NaN.
data['animal'] = lowercased.map(meat_to_animal)
data
food | ounces | animal | |
---|---|---|---|
0 | bacon | 4.0 | pig |
1 | pulled pork | 3.0 | pig |
2 | bacon | 12.0 | pig |
3 | Pastrami | 6.0 | cow |
4 | corned beef | 7.5 | cow |
5 | Bacon | 8.0 | pig |
6 | pastrami | 3.0 | cow |
7 | honey ham | 5.0 | pig |
8 | nova lox | 6.0 | salmon |
data['food'].map(lambda x: meat_to_animal[x.lower()])
0 pig
1 pig
2 pig
3 cow
4 cow
5 pig
6 cow
7 pig
8 salmon
Name: food, dtype: object
Replacing Values¶
data = pd.DataFrame([1., -999., 2., -999., -1000., 3.])
data
0 | |
---|---|
0 | 1.0 |
1 | -999.0 |
2 | 2.0 |
3 | -999.0 |
4 | -1000.0 |
5 | 3.0 |
data.replace(-999, np.nan)
0 | |
---|---|
0 | 1.0 |
1 | NaN |
2 | 2.0 |
3 | NaN |
4 | -1000.0 |
5 | 3.0 |
data.replace([-999, -1000], np.nan)
0 | |
---|---|
0 | 1.0 |
1 | NaN |
2 | 2.0 |
3 | NaN |
4 | NaN |
5 | 3.0 |
data.replace([-999, -1000], [np.nan, 0])
0 | |
---|---|
0 | 1.0 |
1 | NaN |
2 | 2.0 |
3 | NaN |
4 | 0.0 |
5 | 3.0 |
data.replace({-999: np.nan, -1000: 0})
0 | |
---|---|
0 | 1.0 |
1 | NaN |
2 | 2.0 |
3 | NaN |
4 | 0.0 |
5 | 3.0 |
Renaming Axis Indexes¶
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
index=['Ohio', 'Colorado', 'New York'],
columns=['one', 'two', 'three', 'four'])
transform = lambda x: x[:4].upper()
data.index.map(transform)
Index(['OHIO', 'COLO', 'NEW '], dtype='object')
data.index = data.index.map(transform)
data
one | two | three | four | |
---|---|---|---|---|
OHIO | 0 | 1 | 2 | 3 |
COLO | 4 | 5 | 6 | 7 |
NEW | 8 | 9 | 10 | 11 |
data.rename(index=str.title, columns=str.upper)
ONE | TWO | THREE | FOUR | |
---|---|---|---|---|
Ohio | 0 | 1 | 2 | 3 |
Colo | 4 | 5 | 6 | 7 |
New | 8 | 9 | 10 | 11 |
data.rename(index={'OHIO': 'INDIANA'},
columns={'three': 'peekaboo'})
one | two | peekaboo | four | |
---|---|---|---|---|
INDIANA | 0 | 1 | 2 | 3 |
COLO | 4 | 5 | 6 | 7 |
NEW | 8 | 9 | 10 | 11 |
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data
one | two | three | four | |
---|---|---|---|---|
INDIANA | 0 | 1 | 2 | 3 |
COLO | 4 | 5 | 6 | 7 |
NEW | 8 | 9 | 10 | 11 |
Discretization and Binning¶
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
# Integer category codes and the interval labels behind them.
cats.codes
cats.categories
# pd.value_counts was deprecated in pandas 2.1 and removed in 3.0; wrap
# the Categorical in a Series and use the method form instead.
pd.Series(cats).value_counts()
(18, 25] 5
(35, 60] 3
(25, 35] 3
(60, 100] 1
dtype: int64
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']
data = np.random.rand(20)
pd.cut(data, 4, precision=2)
[(0.77, 0.99], (0.54, 0.77], (0.54, 0.77], (0.31, 0.54], (0.31, 0.54], ..., (0.081, 0.31], (0.081, 0.31], (0.54, 0.77], (0.54, 0.77], (0.081, 0.31]]
Length: 20
Categories (4, interval[float64]): [(0.081, 0.31] < (0.31, 0.54] < (0.54, 0.77] < (0.77, 0.99]]
data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles (equal-sized bins)
cats
# pd.value_counts was deprecated in pandas 2.1 and removed in 3.0; use
# the Series method instead. Quartiles of 1000 points give 250 each.
pd.Series(cats).value_counts()
(0.669, 3.901] 250
(-0.0713, 0.669] 250
(-0.72, -0.0713] 250
(-3.187, -0.72] 250
dtype: int64
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
[(-3.187, -1.306], (-1.306, -0.0713], (-1.306, -0.0713], (-0.0713, 1.289], (-1.306, -0.0713], ..., (-1.306, -0.0713], (-0.0713, 1.289], (-1.306, -0.0713], (-0.0713, 1.289], (-0.0713, 1.289]]
Length: 1000
Categories (4, interval[float64]): [(-3.187, -1.306] < (-1.306, -0.0713] < (-0.0713, 1.289] < (1.289, 3.901]]
Detecting and Filtering Outliers¶
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | 0.047349 | -0.002089 | 0.029590 | -0.027096 |
std | 0.997233 | 1.005238 | 0.978493 | 1.001752 |
min | -3.583059 | -2.924918 | -2.921146 | -3.141436 |
25% | -0.569986 | -0.671058 | -0.638040 | -0.690526 |
50% | 0.070691 | -0.023134 | 0.079285 | -0.037656 |
75% | 0.721326 | 0.640628 | 0.726295 | 0.668395 |
max | 3.009291 | 3.471942 | 3.501940 | 3.283467 |
col = data[2]
col[np.abs(col) > 3]
69 3.50194
Name: 2, dtype: float64
# Select rows where *any* column exceeds 3 in absolute value.
# .any(1) passed the axis positionally, which pandas removed in 2.0 —
# the axis must now be given by keyword.
data[(np.abs(data) > 3).any(axis=1)]
0 | 1 | 2 | 3 | |
---|---|---|---|---|
69 | 1.627038 | 0.161114 | 3.501940 | -1.397388 |
214 | 3.009291 | -0.513320 | 0.909048 | 0.994116 |
320 | -0.611410 | -0.425844 | 1.403428 | -3.141436 |
498 | -3.308480 | 1.015552 | 0.122005 | 0.798187 |
554 | 0.853870 | 0.938411 | 0.441925 | -3.056820 |
565 | -1.883857 | -0.372985 | 0.555274 | 3.283467 |
592 | -0.894171 | 0.734381 | -0.628007 | 3.019549 |
803 | -0.370629 | 3.471942 | 2.001244 | -0.260848 |
915 | 0.336541 | 1.110847 | 1.414507 | 3.011419 |
953 | -3.583059 | 0.952141 | -0.447062 | 0.578827 |
# Cap outliers: clip any |value| > 3 to +/-3, preserving the sign.
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
count | 1000.000000 | 1000.000000 | 1000.000000 | 1000.000000 |
mean | 0.048231 | -0.002560 | 0.029088 | -0.027213 |
std | 0.994255 | 1.003715 | 0.976837 | 1.000159 |
min | -3.000000 | -2.924918 | -2.921146 | -3.000000 |
25% | -0.569986 | -0.671058 | -0.638040 | -0.690526 |
50% | 0.070691 | -0.023134 | 0.079285 | -0.037656 |
75% | 0.721326 | 0.640628 | 0.726295 | 0.668395 |
max | 3.000000 | 3.000000 | 3.000000 | 3.000000 |
np.sign(data).head()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 1.0 | -1.0 | -1.0 | 1.0 |
1 | 1.0 | 1.0 | 1.0 | -1.0 |
2 | -1.0 | -1.0 | 1.0 | -1.0 |
3 | 1.0 | -1.0 | 1.0 | -1.0 |
4 | -1.0 | -1.0 | -1.0 | -1.0 |
Permutation and Random Sampling¶
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
sampler
array([2, 4, 0, 1, 3])
df
df.take(sampler)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
2 | 8 | 9 | 10 | 11 |
4 | 16 | 17 | 18 | 19 |
0 | 0 | 1 | 2 | 3 |
1 | 4 | 5 | 6 | 7 |
3 | 12 | 13 | 14 | 15 |
df.sample(n=3)
0 | 1 | 2 | 3 | |
---|---|---|---|---|
4 | 16 | 17 | 18 | 19 |
2 | 8 | 9 | 10 | 11 |
0 | 0 | 1 | 2 | 3 |
# Drawing 10 samples from only 5 values requires replace=True
# (sampling with replacement).
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws
1 7
0 5
3 6
1 7
1 7
1 7
0 5
4 4
0 5
2 -1
dtype: int64
Computing Indicator/Dummy Variables¶
# One-hot encode the categorical 'key' column: one indicator (0/1)
# column per distinct value.
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
'data1': range(6)})
pd.get_dummies(df['key'])
a | b | c | |
---|---|---|---|
0 | 0 | 1 | 0 |
1 | 0 | 1 | 0 |
2 | 1 | 0 | 0 |
3 | 0 | 0 | 1 |
4 | 1 | 0 | 0 |
5 | 0 | 1 | 0 |
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
data1 | key_a | key_b | key_c | |
---|---|---|---|---|
0 | 0 | 0 | 1 | 0 |
1 | 1 | 0 | 1 | 0 |
2 | 2 | 1 | 0 | 0 |
3 | 3 | 0 | 0 | 1 |
4 | 4 | 1 | 0 | 0 |
5 | 5 | 0 | 1 | 0 |
String Manipulation¶
String Object Methods¶
val = 'a,b, guido'
val.split(',')
['a', 'b', ' guido']
pieces = [x.strip() for x in val.split(',')]
pieces
['a', 'b', 'guido']
first, second, third = pieces
first + '::' + second + '::' + third
'a::b::guido'
'::'.join(pieces)
'a::b::guido'
'guido' in val
val.index(',')
val.find(':')
-1
val.index(':')
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-63-2c016e7367ac> in <module>
----> 1 val.index(':')
ValueError: substring not found
val.count(',')
val.replace(',', '::')
val.replace(',', '')
Regular Expressions¶
import re
text = "foo bar\t baz \tqux"
# Always write regex patterns as raw strings: in a plain string literal
# '\s' is an invalid escape sequence (a SyntaxWarning from Python 3.12
# onward, and slated to become an error). The pattern itself is unchanged.
re.split(r'\s+', text)
# Compiling once is worthwhile when the same pattern is reused.
regex = re.compile(r'\s+')
regex.split(text)
# findall returns the matched whitespace runs themselves.
regex.findall(text)
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)
regex.findall(text)
m = regex.search(text)
m
text[m.start():m.end()]
print(regex.match(text))
print(regex.sub('REDACTED', text))
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups()
regex.findall(text)
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))
Vectorized String Functions in pandas¶
# Series of email strings with one missing value; the .str accessor
# propagates NaN instead of raising on it.
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data
data.isnull()
# Vectorized substring test; the NaN entry stays NaN.
data.str.contains('gmail')
pattern
data.str.findall(pattern, flags=re.IGNORECASE)
# str.match returns a boolean Series on modern pandas.
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches
# NOTE(review): because `matches` holds booleans on current pandas, the
# two .str lookups below no longer retrieve match groups as they did in
# the book's older pandas — consider data.str.extract(pattern) instead.
matches.str.get(1)
matches.str[0]
# Plain positional slicing also vectorizes over the Series.
data.str[:5]
# Restore the default row-display limit. The original notebook referenced
# PREVIOUS_MAX_ROWS here, but in this top-to-bottom export that name is
# only defined *below* (in the setup cell after "Conclusion"), so using
# it here would raise NameError; reset_option restores the pandas default
# safely regardless of cell order.
pd.reset_option('display.max_rows')
Conclusion¶
# Notebook setup cell — in the original notebook this runs *first*, but it
# appears at the end of this export. It saves the current display limit so
# it can be restored later, shrinks table output, and fixes the RNG seed so
# the random examples above are reproducible.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)