Lecture 7 - Functions, Pandas, Data Cleaning, and Preparation¶

In this lecture we will talk more about:

  • Functions in Python

  • Writing Python programs and running them from the terminal

  • Pandas Dataframes

Pandas¶

This tutorial has been adapted from material by Wes McKinney, the creator of Pandas.

import numpy as np
import pandas as pd

# Remember to always import pandas

Data loading¶

Last time, and in Lab 3, we talked about loading data in Python using pandas.

Let’s talk now about how to clean and prepare data using Pandas.

Handling Missing Data¶

# Build a one-column DataFrame containing a missing value (np.nan).
string_data = pd.DataFrame(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
# isnull() returns a same-shaped boolean frame: True where data is missing.
string_data.isnull()
0
0 False
1 False
2 True
3 False
# Assigning None to the whole column also counts as missing data.
string_data[0] = None
string_data.isnull()
0
0 True
1 True
2 True
3 True

Filtering Out Missing Data¶

# Alias numpy's nan as NA for brevity in the examples below.
from numpy import nan as NA
data = pd.DataFrame([1, NA, 3.5, NA, 7])
# dropna() removes every row that contains a missing value.
data.dropna()
0
0 1.0
2 3.5
4 7.0
# Boolean masking with notnull() keeps the frame's shape, leaving NaNs in place.
data[data.notnull()]
0
0 1.0
1 NaN
2 3.5
3 NaN
4 7.0
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
# By default dropna() drops every row containing at least one NaN.
cleaned = data.dropna()
data
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0
cleaned
0 1 2
0 1.0 6.5 3.0
# how='all' drops only rows where every value is NaN.
data.dropna(how='all')
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
3 NaN 6.5 3.0
# Add a column that is entirely NaN.
data[4] = NA
data
0 1 2 4
0 1.0 6.5 3.0 NaN
1 1.0 NaN NaN NaN
2 NaN NaN NaN NaN
3 NaN 6.5 3.0 NaN
# axis=1 applies the same logic to columns instead of rows.
data.dropna(axis=1, how='all')
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0
df = pd.DataFrame(np.random.randn(7, 3))
# Blank out the first 4 rows of column 1 and the first 2 rows of column 2.
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
0 1 2
0 -1.695111 NaN NaN
1 0.047417 NaN NaN
2 0.479154 NaN -0.763134
3 1.023613 NaN 0.717773
4 -0.213959 -0.187290 0.171453
5 -1.242170 -0.003887 -1.705038
6 1.025683 -1.007867 0.084850
# Default dropna() keeps only the fully-populated rows.
df.dropna()
0 1 2
4 -0.213959 -0.187290 0.171453
5 -1.242170 -0.003887 -1.705038
6 1.025683 -1.007867 0.084850
# thresh=2 keeps rows that have at least 2 non-NaN values.
df.dropna(thresh=2)
0 1 2
2 0.479154 NaN -0.763134
3 1.023613 NaN 0.717773
4 -0.213959 -0.187290 0.171453
5 -1.242170 -0.003887 -1.705038
6 1.025683 -1.007867 0.084850

Filling In Missing Data¶

# Fill every NaN with a constant value.
df.fillna(0)
0 1 2
0 -1.695111 0.000000 0.000000
1 0.047417 0.000000 0.000000
2 0.479154 0.000000 -0.763134
3 1.023613 0.000000 0.717773
4 -0.213959 -0.187290 0.171453
5 -1.242170 -0.003887 -1.705038
6 1.025683 -1.007867 0.084850
# A dict maps column label -> fill value, so each column gets its own fill.
df.fillna({1: 0.5, 2: 0})
0 1 2
0 -1.695111 0.500000 0.000000
1 0.047417 0.500000 0.000000
2 0.479154 0.500000 -0.763134
3 1.023613 0.500000 0.717773
4 -0.213959 -0.187290 0.171453
5 -1.242170 -0.003887 -1.705038
6 1.025683 -1.007867 0.084850
# inplace=True mutates df itself and returns None (hence the throwaway _).
_ = df.fillna(0, inplace=True)
df
0 1 2
0 -1.695111 0.000000 0.000000
1 0.047417 0.000000 0.000000
2 0.479154 0.000000 -0.763134
3 1.023613 0.000000 0.717773
4 -0.213959 -0.187290 0.171453
5 -1.242170 -0.003887 -1.705038
6 1.025683 -1.007867 0.084850
# Fresh frame with NaNs in the tails of columns 1 and 2 to demonstrate
# forward-filling.
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df
# ffill() propagates the last valid observation forward down each column.
# (fillna(method='ffill') is deprecated since pandas 2.1 and removed in 3.0.)
df.ffill()
# limit=2 fills at most two consecutive NaNs per column.
df.ffill(limit=2)
0 1 2
0 -0.648247 -0.485004 -1.217676
1 1.215154 0.401557 1.171716
2 0.388848 0.401557 -0.747839
3 2.503947 0.401557 -2.352020
4 0.880440 NaN -2.352020
5 0.528105 NaN -2.352020
data = pd.DataFrame([1., NA, 3.5, NA, 7])
# Fill NaNs with the column mean -- a simple imputation strategy.
data.fillna(data.mean())
0
0 1.000000
1 3.833333
2 3.500000
3 3.833333
4 7.000000

Data Transformation¶

Removing Duplicates¶

data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
6 two 4
# duplicated() flags rows that exactly repeat an earlier row (only row 6 here).
data.duplicated()
0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool
# drop_duplicates() removes the flagged rows.
data.drop_duplicates()
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
data['v1'] = range(7)
# A column subset restricts which columns are compared for duplicates.
data.drop_duplicates(['k1'])
k1 k2 v1
0 one 1 0
1 two 1 1
# keep='last' retains the final occurrence of each duplicate group.
data.drop_duplicates(['k1', 'k2'], keep='last')
k1 k2 v1
0 one 1 0
1 two 1 1
2 one 2 2
3 two 3 3
4 one 3 4
6 two 4 6

Transforming Data Using a Function or Mapping¶

data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
food ounces
0 bacon 4.0
1 pulled pork 3.0
2 bacon 12.0
3 Pastrami 6.0
4 corned beef 7.5
5 Bacon 8.0
6 pastrami 3.0
7 honey ham 5.0
8 nova lox 6.0
# Lookup table from food name to source animal (keys are all lowercase).
meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}
# Normalize case first so mixed-case entries like 'Pastrami' hit the dict keys.
lowercased = data['food'].str.lower()
lowercased
# Series.map applies the dict lookup element-wise.
data['animal'] = lowercased.map(meat_to_animal)
data
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon
# Equivalent one-liner: lowercase and look up inside a single lambda.
data['food'].map(lambda x: meat_to_animal[x.lower()])
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

Replacing Values¶

# -999 and -1000 are sentinel codes standing in for missing data.
data = pd.DataFrame([1., -999., 2., -999., -1000., 3.])
data
0
0 1.0
1 -999.0
2 2.0
3 -999.0
4 -1000.0
5 3.0
# Replace a single sentinel value with NaN.
data.replace(-999, np.nan)
0
0 1.0
1 NaN
2 2.0
3 NaN
4 -1000.0
5 3.0
# A list replaces several values with the same substitute.
data.replace([-999, -1000], np.nan)
0
0 1.0
1 NaN
2 2.0
3 NaN
4 NaN
5 3.0
# Two parallel lists pair each value with its own substitute.
data.replace([-999, -1000], [np.nan, 0])
0
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
# A dict expresses the same pairing more readably.
data.replace({-999: np.nan, -1000: 0})
0
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0

Renaming Axis Indexes¶

data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
# Truncate each label to its first 4 characters and uppercase it.
transform = lambda x: x[:4].upper()
data.index.map(transform)
Index(['OHIO', 'COLO', 'NEW '], dtype='object')
# Assign the mapped labels back to modify the index in place.
data.index = data.index.map(transform)
data
one two three four
OHIO 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11
# rename() returns a transformed copy; str.title / str.upper act on each label.
data.rename(index=str.title, columns=str.upper)
ONE TWO THREE FOUR
Ohio 0 1 2 3
Colo 4 5 6 7
New 8 9 10 11
# Dicts rename only the listed labels, leaving the rest untouched.
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})
one two peekaboo four
INDIANA 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11
# inplace=True mutates data instead of returning a copy.
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data
one two three four
INDIANA 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11

Discretization and Binning¶

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# Bin edges; intervals are right-closed by default: (18, 25], (25, 35], ...
bins = [18, 25, 35, 60, 100]
# cut() assigns each age to its interval, producing a Categorical.
cats = pd.cut(ages, bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
# codes are the integer bin indices; categories are the interval objects.
cats.codes
cats.categories
# Series.value_counts() is the modern spelling; the top-level
# pd.value_counts is deprecated (pandas 2.1+).
cats.value_counts()
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64
# right=False makes the bins left-closed: [18, 26) instead of (18, 25].
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
# labels= replaces the interval notation with human-readable names.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']
data = np.random.rand(20)
# Passing an integer asks cut() for that many equal-width bins;
# precision= controls how many decimals appear in the bin labels.
pd.cut(data, 4, precision=2)
[(0.77, 0.99], (0.54, 0.77], (0.54, 0.77], (0.31, 0.54], (0.31, 0.54], ..., (0.081, 0.31], (0.081, 0.31], (0.54, 0.77], (0.54, 0.77], (0.081, 0.31]]
Length: 20
Categories (4, interval[float64]): [(0.081, 0.31] < (0.31, 0.54] < (0.54, 0.77] < (0.77, 0.99]]
data = np.random.randn(1000)  # Normally distributed
# qcut bins by sample quantiles, so each quartile gets equal membership.
cats = pd.qcut(data, 4)  # Cut into quartiles
cats
# Series.value_counts() is the modern spelling; the top-level
# pd.value_counts is deprecated (pandas 2.1+).
cats.value_counts()
(0.669, 3.901]      250
(-0.0713, 0.669]    250
(-0.72, -0.0713]    250
(-3.187, -0.72]     250
dtype: int64
# Explicit quantile edges (must start at 0 and end at 1) give uneven bins:
# here the 10%, 50%, and 90% quantiles.
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
[(-3.187, -1.306], (-1.306, -0.0713], (-1.306, -0.0713], (-0.0713, 1.289], (-1.306, -0.0713], ..., (-1.306, -0.0713], (-0.0713, 1.289], (-1.306, -0.0713], (-0.0713, 1.289], (-0.0713, 1.289]]
Length: 1000
Categories (4, interval[float64]): [(-3.187, -1.306] < (-1.306, -0.0713] < (-0.0713, 1.289] < (1.289, 3.901]]

Detecting and Filtering Outliers¶

data = pd.DataFrame(np.random.randn(1000, 4))
# describe() summarizes each column; with N(0,1) draws, min/max values
# beyond +/-3 are the outliers hunted below.
data.describe()
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean 0.047349 -0.002089 0.029590 -0.027096
std 0.997233 1.005238 0.978493 1.001752
min -3.583059 -2.924918 -2.921146 -3.141436
25% -0.569986 -0.671058 -0.638040 -0.690526
50% 0.070691 -0.023134 0.079285 -0.037656
75% 0.721326 0.640628 0.726295 0.668395
max 3.009291 3.471942 3.501940 3.283467
# Pull out column 2 and list the values exceeding 3 in absolute magnitude.
col = data[2]
col[np.abs(col) > 3]
69    3.50194
Name: 2, dtype: float64
# Select rows where any column exceeds 3 in absolute value.
# (The positional axis argument was removed in pandas 2.0 -- use axis=1.)
data[(np.abs(data) > 3).any(axis=1)]
0 1 2 3
69 1.627038 0.161114 3.501940 -1.397388
214 3.009291 -0.513320 0.909048 0.994116
320 -0.611410 -0.425844 1.403428 -3.141436
498 -3.308480 1.015552 0.122005 0.798187
554 0.853870 0.938411 0.441925 -3.056820
565 -1.883857 -0.372985 0.555274 3.283467
592 -0.894171 0.734381 -0.628007 3.019549
803 -0.370629 3.471942 2.001244 -0.260848
915 0.336541 1.110847 1.414507 3.011419
953 -3.583059 0.952141 -0.447062 0.578827
# Cap outliers: np.sign(data) is -1/+1, so values outside [-3, 3] become
# exactly -3 or 3.
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean 0.048231 -0.002560 0.029088 -0.027213
std 0.994255 1.003715 0.976837 1.000159
min -3.000000 -2.924918 -2.921146 -3.000000
25% -0.569986 -0.671058 -0.638040 -0.690526
50% 0.070691 -0.023134 0.079285 -0.037656
75% 0.721326 0.640628 0.726295 0.668395
max 3.000000 3.000000 3.000000 3.000000
# sign() returns the elementwise sign as -1.0 or 1.0.
np.sign(data).head()
0 1 2 3
0 1.0 -1.0 -1.0 1.0
1 1.0 1.0 1.0 -1.0
2 -1.0 -1.0 1.0 -1.0
3 1.0 -1.0 1.0 -1.0
4 -1.0 -1.0 -1.0 -1.0

Permutation and Random Sampling¶

df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
# permutation(5) returns the integers 0..4 in random order.
sampler = np.random.permutation(5)
sampler
array([2, 4, 0, 1, 3])
df
# take() reorders the rows according to the sampler positions.
df.take(sampler)
0 1 2 3
2 8 9 10 11
4 16 17 18 19
0 0 1 2 3
1 4 5 6 7
3 12 13 14 15
# sample(n=3) picks 3 random rows without replacement.
df.sample(n=3)
0 1 2 3
4 16 17 18 19
2 8 9 10 11
0 0 1 2 3
choices = pd.Series([5, 7, -1, 6, 4])
# replace=True allows the same element to be drawn more than once.
draws = choices.sample(n=10, replace=True)
draws
1    7
0    5
3    6
1    7
1    7
1    7
0    5
4    4
0    5
2   -1
dtype: int64

Computing Indicator/Dummy Variables¶

df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
# get_dummies builds one indicator column per distinct key value.
# NOTE(review): pandas >= 2.0 returns boolean dummies (True/False); the
# 0/1 output below comes from an older version -- confirm with your install.
pd.get_dummies(df['key'])
a b c
0 0 1 0
1 0 1 0
2 1 0 0
3 0 0 1
4 1 0 0
5 0 1 0
# prefix= prepends 'key_' so dummy columns don't collide with other names.
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
data1 key_a key_b key_c
0 0 0 1 0
1 1 0 1 0
2 2 1 0 0
3 3 0 0 1
4 4 1 0 0
5 5 0 1 0

String Manipulation¶

String Object Methods¶

val = 'a,b,  guido'
# split() keeps any surrounding whitespace in the pieces.
val.split(',')
['a', 'b', '  guido']
# strip() each piece to drop the stray spaces.
pieces = [x.strip() for x in val.split(',')]
pieces
['a', 'b', 'guido']
first, second, third = pieces
first + '::' + second + '::' + third
'a::b::guido'
# join() is the idiomatic (and faster) way to glue strings together.
'::'.join(pieces)
'a::b::guido'
'guido' in val
val.index(',')
# find() returns -1 when the substring is absent...
val.find(':')
-1
# ...whereas index() raises ValueError, as the traceback below shows.
val.index(':')
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-63-2c016e7367ac> in <module>
----> 1 val.index(':')

ValueError: substring not found
val.count(',')
# replace() substitutes every occurrence; an empty replacement deletes them.
val.replace(',', '::')
val.replace(',', '')

Regular Expressions¶

import re
text = "foo    bar\t baz  \tqux"
# Use raw strings for regex patterns: '\s' is an invalid escape sequence in
# a normal string literal (SyntaxWarning since Python 3.12).
re.split(r'\s+', text)
# Compiling the pattern once is preferable when it will be reused.
regex = re.compile(r'\s+')
regex.split(text)
# findall returns the matched whitespace runs themselves.
regex.findall(text)
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
# A simple email pattern: user part, '@', domain, dot, 2-4 letter suffix.
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)
# findall returns every match; search returns only the first match object.
regex.findall(text)
m = regex.search(text)
m
text[m.start():m.end()]
# match() anchors at the start of the string, so it returns None here.
print(regex.match(text))
# sub() replaces each match with the given string.
print(regex.sub('REDACTED', text))
# Parenthesized groups split each address into user, domain, and suffix.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups()
# With groups, findall returns tuples of the captured pieces.
regex.findall(text)
# \1, \2, \3 in the replacement refer back to the captured groups.
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))

Vectorized String Functions in pandas¶

data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data
data.isnull()
# Vectorized .str methods propagate NaN instead of raising on missing values.
data.str.contains('gmail')
pattern
data.str.findall(pattern, flags=re.IGNORECASE)
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches
# NOTE(review): in modern pandas, str.match returns a boolean Series rather
# than match groups, so the two lines below fail; use
# data.str.extract(pattern, flags=re.IGNORECASE) to recover the groups.
matches.str.get(1)
matches.str[0]
# Slicing with .str[:5] takes the first five characters of each string.
data.str[:5]
# NOTE(review): PREVIOUS_MAX_ROWS is defined in the setup cell that appears
# under "Conclusion" below -- run that cell first or this line raises NameError.
pd.options.display.max_rows = PREVIOUS_MAX_ROWS

Conclusion¶

# Notebook setup cell (it appears out of order in this export): save and cap
# pandas' display rows, seed numpy so the random examples are reproducible,
# and configure matplotlib figure size and numpy print precision.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)