Lecture 7 - Functions, Pandas, Data Cleaning, and Preparation¶

In this lecture we will talk more about:

  • Functions in Python

  • Pandas Dataframes

Functions in Python¶

def doubleNum(nb):
    """Return twice the value of *nb*."""
    return nb * 2

number = 3
print("The double is", doubleNum(number))
The double is 6
number = 4
print("The double is", (lambda x: x*2)(number))
The double is 8

Pandas¶

This tutorial has been adapted from material by Wes McKinney, the creator of Pandas.

import numpy as np
import pandas as pd

# Remember to always import pandas

Data loading¶

Last time, and in Lab 3, we talked about loading data in Python using pandas.

Let’s talk now about how to clean and prepare data using Pandas.

Handling Missing Data¶

string_data = pd.DataFrame(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
string_data.isnull()
0
0 False
1 False
2 True
3 False
string_data[0] = None
string_data
string_data.isnull()
0
0 True
1 True
2 True
3 True

Filtering Out Missing Data¶

from numpy import nan as NA
data = pd.DataFrame([1, NA, 3.5, NA, 7])
data
0
0 1.0
1 NaN
2 3.5
3 NaN
4 7.0
data.dropna()
0
0 1.0
2 3.5
4 7.0
data[data.notnull()]
0
0 1.0
1 NaN
2 3.5
3 NaN
4 7.0
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()
data
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0
cleaned
0 1 2
0 1.0 6.5 3.0
data.dropna(how='all')
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
3 NaN 6.5 3.0
data[4] = NA
data
0 1 2 4
0 1.0 6.5 3.0 NaN
1 1.0 NaN NaN NaN
2 NaN NaN NaN NaN
3 NaN 6.5 3.0 NaN
data.dropna(axis=1, how='all')
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
0 1 2
0 -0.147671 NaN NaN
1 0.808575 NaN NaN
2 -0.417914 NaN 1.339269
3 -1.114100 NaN -1.934764
4 -0.369869 -0.100167 0.338057
5 1.358480 -1.395844 -2.302565
6 0.537828 0.146920 -0.485762
df.dropna()
0 1 2
4 -0.369869 -0.100167 0.338057
5 1.358480 -1.395844 -2.302565
6 0.537828 0.146920 -0.485762
#Keep only the rows with at least 2 non-NA values.
df.dropna(thresh=2)
0 1 2
2 -0.417914 NaN 1.339269
3 -1.114100 NaN -1.934764
4 -0.369869 -0.100167 0.338057
5 1.358480 -1.395844 -2.302565
6 0.537828 0.146920 -0.485762

Filling In Missing Data¶

df.fillna(0)
0 1 2
0 -0.147671 0.000000 0.000000
1 0.808575 0.000000 0.000000
2 -0.417914 0.000000 1.339269
3 -1.114100 0.000000 -1.934764
4 -0.369869 -0.100167 0.338057
5 1.358480 -1.395844 -2.302565
6 0.537828 0.146920 -0.485762
df.fillna({1: 0.5, 2: 0})
0 1 2
0 -0.147671 0.500000 0.000000
1 0.808575 0.500000 0.000000
2 -0.417914 0.500000 1.339269
3 -1.114100 0.500000 -1.934764
4 -0.369869 -0.100167 0.338057
5 1.358480 -1.395844 -2.302565
6 0.537828 0.146920 -0.485762
df.fillna(0, inplace=True)
df
0 1 2
0 -0.147671 0.000000 0.000000
1 0.808575 0.000000 0.000000
2 -0.417914 0.000000 1.339269
3 -1.114100 0.000000 -1.934764
4 -0.369869 -0.100167 0.338057
5 1.358480 -1.395844 -2.302565
6 0.537828 0.146920 -0.485762
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df
#df.fillna(method='ffill') # forward fill with last valid observation (bfill to fill backward)
df.fillna(method='ffill', limit=2) # limit = maximum number of consecutive NaN values to forward/backward fill
0 1 2
0 0.331259 -0.619757 1.100359
1 0.469276 -0.696190 -0.816206
2 0.617573 -0.696190 0.235891
3 3.014322 -0.696190 -0.004664
4 1.199192 NaN -0.004664
5 -0.835414 NaN -0.004664
data = pd.DataFrame([1., NA, 3.5, NA, 7])
data.fillna(data.mean()) #fills NA with data mean value
0
0 1.000000
1 3.833333
2 3.500000
3 3.833333
4 7.000000

Data Transformation¶

Removing Duplicates¶

data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
6 two 4
data.duplicated()
0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool
data.drop_duplicates()
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
data['v1'] = range(7)
data
data.drop_duplicates(['k1'])
k1 k2 v1
0 one 1 0
1 two 1 1
data
data.drop_duplicates(['k1', 'k2'], keep='last')
k1 k2 v1
0 one 1 0
1 two 1 1
2 one 2 2
3 two 3 3
4 one 3 4
6 two 4 6

Transforming Data Using a Function or Mapping¶

data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
food ounces
0 bacon 4.0
1 pulled pork 3.0
2 bacon 12.0
3 Pastrami 6.0
4 corned beef 7.5
5 Bacon 8.0
6 pastrami 3.0
7 honey ham 5.0
8 nova lox 6.0
meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}
lowercased = data['food'].str.lower()
lowercased
data['animal'] = lowercased.map(meat_to_animal)
data
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon
data['food'].str.lower().map(lambda x: meat_to_animal[x])
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

Replacing Values¶

data = pd.DataFrame([1., -999., 2., -999., -1000., 3.])
data
0
0 1.0
1 -999.0
2 2.0
3 -999.0
4 -1000.0
5 3.0
data.replace(-999, np.nan)
0
0 1.0
1 NaN
2 2.0
3 NaN
4 -1000.0
5 3.0
data.replace([-999, -1000], np.nan)
0
0 1.0
1 NaN
2 2.0
3 NaN
4 NaN
5 3.0
data.replace([-999, -1000], [np.nan, 0])
0
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
data=data.replace({-999: np.nan, -1000: 0})
data.replace(np.nan, 0)
0
0 1.0
1 0.0
2 2.0
3 0.0
4 0.0
5 3.0

Renaming Axis Indexes¶

data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
New York 8 9 10 11
transform = lambda x: x[:4].upper()
data.index.map(transform)
Index(['OHIO', 'COLO', 'NEW '], dtype='object')
data.index = data.index.map(transform)
data
one two three four
OHIO 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11
data.rename(index=str.title, columns=str.upper)
ONE TWO THREE FOUR
Ohio 0 1 2 3
Colo 4 5 6 7
New 8 9 10 11
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})
one two peekaboo four
INDIANA 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11
data
one two three four
OHIO 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data
one two three four
INDIANA 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11

Discretization and Binning¶

ages = [18, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats
[NaN, (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats.codes
cats.categories
pd.value_counts(cats)
(18, 25]     4
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
[NaN, 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']
data = np.random.rand(20)
data
pd.cut(data, 4, precision=2)
[(0.05, 0.27], (0.05, 0.27], (0.27, 0.49], (0.27, 0.49], (0.05, 0.27], ..., (0.49, 0.71], (0.05, 0.27], (0.71, 0.93], (0.27, 0.49], (0.27, 0.49]]
Length: 20
Categories (4, interval[float64]): [(0.05, 0.27] < (0.27, 0.49] < (0.49, 0.71] < (0.71, 0.93]]
data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 10)  # Cut into deciles (10 equal-sized bins)
cats
pd.value_counts(cats)
(-3.318, -1.219]    100
(-1.219, -0.777]    100
(-0.777, -0.503]    100
(-0.503, -0.227]    100
(-0.227, 0.0661]    100
(0.0661, 0.315]     100
(0.315, 0.615]      100
(0.615, 0.977]      100
(0.977, 1.348]      100
(1.348, 3.401]      100
dtype: int64
cats = pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
pd.value_counts(cats)
(-1.219, 0.0661]    400
(0.0661, 1.348]     400
(-3.318, -1.219]    100
(1.348, 3.401]      100
dtype: int64

Detecting and Filtering Outliers¶

data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean 0.055077 0.045030 -0.025765 -0.000986
std 0.993835 0.978408 0.978154 1.031257
min -3.832766 -3.076625 -3.213987 -3.419100
25% -0.583017 -0.613001 -0.719985 -0.683580
50% 0.055401 0.039207 -0.010730 -0.012271
75% 0.707915 0.764889 0.677561 0.673964
max 3.739664 3.000617 2.741589 3.223334
col = data[2]
col[np.abs(col) > 3]
420   -3.213987
485   -3.023981
Name: 2, dtype: float64
data[(np.abs(data) > 3).any(1)]
0 1 2 3
111 -1.237494 0.385405 1.708786 3.223334
186 0.129223 -0.257331 -1.856794 -3.419100
308 -0.324471 3.000617 -1.237963 -0.085539
356 3.192386 0.567717 0.174435 0.320250
420 0.679497 -2.224678 -3.213987 -0.504708
425 -3.832766 -1.893903 0.340340 -0.327178
473 0.363863 -3.076625 -0.344005 -0.413175
485 0.124100 1.183545 -3.023981 0.579325
541 3.190462 -0.028757 -0.047572 2.121152
753 3.739664 0.733766 -0.229820 0.368722
823 1.983805 -0.933073 -1.768958 -3.194717
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean 0.054787 0.045106 -0.025527 -0.000596
std 0.987260 0.978164 0.977406 1.028689
min -3.000000 -3.000000 -3.000000 -3.000000
25% -0.583017 -0.613001 -0.719985 -0.683580
50% 0.055401 0.039207 -0.010730 -0.012271
75% 0.707915 0.764889 0.677561 0.673964
max 3.000000 3.000000 2.741589 3.000000
data.head()
np.sign(data).head()
0 1 2 3
0 1.0 1.0 -1.0 -1.0
1 1.0 1.0 1.0 -1.0
2 1.0 -1.0 -1.0 -1.0
3 1.0 -1.0 1.0 -1.0
4 -1.0 -1.0 1.0 -1.0

Permutation and Random Sampling¶

df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
df
#sampler = np.random.permutation(5)
#sampler
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
4 16 17 18 19
df
df.take(sampler)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-58-2cab25988b0f> in <module>
      1 df
----> 2 df.take(sampler)

NameError: name 'sampler' is not defined
df.sample(n=3) # selects 3 random rows (without replacement by default)
0 1 2 3
3 12 13 14 15
4 16 17 18 19
2 8 9 10 11
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws
3    6
0    5
0    5
3    6
2   -1
3    6
0    5
0    5
4    4
4    4
dtype: int64

Computing Indicator/Dummy Variables¶

df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df
pd.get_dummies(df['key'])
a b c
0 0 1 0
1 0 1 0
2 1 0 0
3 0 0 1
4 1 0 0
5 0 1 0
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
data1 key_a key_b key_c
0 0 0 1 0
1 1 0 1 0
2 2 1 0 0
3 3 0 0 1
4 4 1 0 0
5 5 0 1 0

String Manipulation¶

String Object Methods¶

val = 'a,b,  guido'
val.split(',')
['a', 'b', '  guido']
pieces = [x.strip() for x in val.split(',')]
pieces
['a', 'b', 'guido']
first, second, third = pieces
third
first + '::' + second + '::' + third
'a::b::guido'
'::'.join(pieces)
'a::b::guido'
'guido' in val
val.index(',')
val.find(':')
-1
#val.index(':')
val.count('a')
1
val
#val.replace(',', '::')
val.replace(',', '')
'ab  guido'

Regular Expressions¶

import re
text = "foo    bar\t baz  \tqux"
re.split(r'\s+', text)  # r'\s+' matches runs of any whitespace (space, tab, newline...)
['foo', 'bar', 'baz', 'qux']
regex = re.compile(r'\s+')
regex.split(text)
['foo', 'bar', 'baz', 'qux']
regex.findall(text)
['    ', '\t ', '  \t']
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}' #r for raw string

# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)
regex.findall(text)
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']
m = regex.search(text)
text[m.start():m.end()]
'dave@google.com'
print(regex.match(text))
None
print(regex.sub('REDACTED', text))
Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups()
('wesm', 'bright', 'net')
regex.findall(text)
[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]
print(regex.sub(r'\nUsername: \1, Domain: \2, Suffix: \3', text))
Dave 
Username: dave, Domain: google, Suffix: com
Steve 
Username: steve, Domain: gmail, Suffix: com
Rob 
Username: rob, Domain: gmail, Suffix: com
Ryan 
Username: ryan, Domain: yahoo, Suffix: com

Vectorized String Functions in pandas¶

data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data
data.isnull()
Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool
data.str.contains('gmail')
Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object
pattern
data.str.findall(pattern, flags=re.IGNORECASE)
Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches
Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object
data.str.get(1)
data.str[1]
Dave       a
Steve      t
Rob        o
Wes      NaN
dtype: object
data.str[:5]
Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object