Lecture 7 - Functions, Pandas, Data Cleaning, and Preparation¶

In this lecture we will talk more about:

  • Functions in Python

  • Pandas Dataframes

Functions in Python¶

def doubleNum(nb):
    """Return twice the value of *nb*."""
    return nb * 2

number = 3
print("The double is", doubleNum(number))
The double is 6
number = 4
print("The double is", (lambda x: x*2)(number))
The double is 8

Pandas¶

This tutorial has been adapted from material by Wes McKinney, the creator of Pandas.

import numpy as np
import pandas as pd

# Remember to always import pandas

Data loading¶

Last time, and in Lab 3, we talked about loading data in Python using pandas.

Let’s talk now about how to clean and prepare data using Pandas.

Handling Missing Data¶

string_data = pd.DataFrame(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data
string_data.isnull()
0
0 False
1 False
2 True
3 False
string_data[0] = None
string_data
string_data.isnull()
0
0 True
1 True
2 True
3 True

Filtering Out Missing Data¶

from numpy import nan as NA
data = pd.DataFrame([1, NA, 3.5, NA, 7])
data
0
0 1.0
1 NaN
2 3.5
3 NaN
4 7.0
data.dropna()
0
0 1.0
2 3.5
4 7.0
data[data.notnull()]
0
0 1.0
1 NaN
2 3.5
3 NaN
4 7.0
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()
data
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0
cleaned
0 1 2
0 1.0 6.5 3.0
data.dropna(how='all')
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
3 NaN 6.5 3.0
data[4] = NA
data
0 1 2 4
0 1.0 6.5 3.0 NaN
1 1.0 NaN NaN NaN
2 NaN NaN NaN NaN
3 NaN 6.5 3.0 NaN
data.dropna(axis=1, how='all')
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
0 1 2
0 -0.147671 NaN NaN
1 0.808575 NaN NaN
2 -0.417914 NaN 1.339269
3 -1.114100 NaN -1.934764
4 -0.369869 -0.100167 0.338057
5 1.358480 -1.395844 -2.302565
6 0.537828 0.146920 -0.485762
df.dropna()
0 1 2
4 -0.369869 -0.100167 0.338057
5 1.358480 -1.395844 -2.302565
6 0.537828 0.146920 -0.485762
#Keep only the rows with at least 2 non-NA values.
df.dropna(thresh=2)
0 1 2
2 -0.417914 NaN 1.339269
3 -1.114100 NaN -1.934764
4 -0.369869 -0.100167 0.338057
5 1.358480 -1.395844 -2.302565
6 0.537828 0.146920 -0.485762

Filling In Missing Data¶

df.fillna(0)
0 1 2
0 -0.147671 0.000000 0.000000
1 0.808575 0.000000 0.000000
2 -0.417914 0.000000 1.339269
3 -1.114100 0.000000 -1.934764
4 -0.369869 -0.100167 0.338057
5 1.358480 -1.395844 -2.302565
6 0.537828 0.146920 -0.485762
df.fillna({1: 0.5, 2: 0})
0 1 2
0 -0.147671 0.500000 0.000000
1 0.808575 0.500000 0.000000
2 -0.417914 0.500000 1.339269
3 -1.114100 0.500000 -1.934764
4 -0.369869 -0.100167 0.338057
5 1.358480 -1.395844 -2.302565
6 0.537828 0.146920 -0.485762
df.fillna(0, inplace=True)
df
0 1 2
0 -0.147671 0.000000 0.000000
1 0.808575 0.000000 0.000000
2 -0.417914 0.000000 1.339269
3 -1.114100 0.000000 -1.934764
4 -0.369869 -0.100167 0.338057
5 1.358480 -1.395844 -2.302565
6 0.537828 0.146920 -0.485762
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df
#df.fillna(method='ffill') # forward fill with last valid observation (bfill to fill backward)
df.fillna(method='ffill', limit=2) # limit = maximum number of consecutive NaN values to forward/backward fill
0 1 2
0 0.331259 -0.619757 1.100359
1 0.469276 -0.696190 -0.816206
2 0.617573 -0.696190 0.235891
3 3.014322 -0.696190 -0.004664
4 1.199192 NaN -0.004664
5 -0.835414 NaN -0.004664
data = pd.DataFrame([1., NA, 3.5, NA, 7])
data.fillna(data.mean()) #fills NA with data mean value
0
0 1.000000
1 3.833333
2 3.500000
3 3.833333
4 7.000000

Data Transformation¶

Removing Duplicates¶

data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
6 two 4
data.duplicated()
0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool
data.drop_duplicates()
k1 k2
0 one 1
1 two 1
2 one 2
3 two 3
4 one 3
5 two 4
data['v1'] = range(7)
data
data.drop_duplicates(['k1'])
k1 k2 v1
0 one 1 0
1 two 1 1
data
data.drop_duplicates(['k1', 'k2'], keep='last')
k1 k2 v1
0 one 1 0
1 two 1 1
2 one 2 2
3 two 3 3
4 one 3 4
6 two 4 6

Transforming Data Using a Function or Mapping¶

data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
food ounces
0 bacon 4.0
1 pulled pork 3.0
2 bacon 12.0
3 Pastrami 6.0
4 corned beef 7.5
5 Bacon 8.0
6 pastrami 3.0
7 honey ham 5.0
8 nova lox 6.0
meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}
lowercased = data['food'].str.lower()
lowercased
data['animal'] = lowercased.map(meat_to_animal)
data
food ounces animal
0 bacon 4.0 pig
1 pulled pork 3.0 pig
2 bacon 12.0 pig
3 Pastrami 6.0 cow
4 corned beef 7.5 cow
5 Bacon 8.0 pig
6 pastrami 3.0 cow
7 honey ham 5.0 pig
8 nova lox 6.0 salmon
data['food'].str.lower().map(lambda x: meat_to_animal[x])
0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

Replacing Values¶

data = pd.DataFrame([1., -999., 2., -999., -1000., 3.])
data
0
0 1.0
1 -999.0
2 2.0
3 -999.0
4 -1000.0
5 3.0
data.replace(-999, np.nan)
0
0 1.0
1 NaN
2 2.0
3 NaN
4 -1000.0
5 3.0
data.replace([-999, -1000], np.nan)
0
0 1.0
1 NaN
2 2.0
3 NaN
4 NaN
5 3.0
data.replace([-999, -1000], [np.nan, 0])
0
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
data=data.replace({-999: np.nan, -1000: 0})
data.replace(np.nan, 0)
0
0 1.0
1 0.0
2 2.0
3 0.0
4 0.0
5 3.0

Renaming Axis Indexes¶

data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
New York 8 9 10 11
transform = lambda x: x[:4].upper()
data.index.map(transform)
Index(['OHIO', 'COLO', 'NEW '], dtype='object')
data.index = data.index.map(transform)
data
one two three four
OHIO 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11
data.rename(index=str.title, columns=str.upper)
ONE TWO THREE FOUR
Ohio 0 1 2 3
Colo 4 5 6 7
New 8 9 10 11
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})
one two peekaboo four
INDIANA 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11
data
one two three four
OHIO 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data
one two three four
INDIANA 0 1 2 3
COLO 4 5 6 7
NEW 8 9 10 11

Discretization and Binning¶

ages = [18, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats
[NaN, (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
cats.codes
cats.categories
pd.value_counts(cats)
(18, 25]     4
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
[NaN, 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']
data = np.random.rand(20)
data
pd.cut(data, 4, precision=2)
[(0.05, 0.27], (0.05, 0.27], (0.27, 0.49], (0.27, 0.49], (0.05, 0.27], ..., (0.49, 0.71], (0.05, 0.27], (0.71, 0.93], (0.27, 0.49], (0.27, 0.49]]
Length: 20
Categories (4, interval[float64]): [(0.05, 0.27] < (0.27, 0.49] < (0.49, 0.71] < (0.71, 0.93]]
data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 10)  # Cut into deciles (10 equal-sized bins)
cats
pd.value_counts(cats)
(-3.318, -1.219]    100
(-1.219, -0.777]    100
(-0.777, -0.503]    100
(-0.503, -0.227]    100
(-0.227, 0.0661]    100
(0.0661, 0.315]     100
(0.315, 0.615]      100
(0.615, 0.977]      100
(0.977, 1.348]      100
(1.348, 3.401]      100
dtype: int64
cats = pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
pd.value_counts(cats)
(-1.219, 0.0661]    400
(0.0661, 1.348]     400
(-3.318, -1.219]    100
(1.348, 3.401]      100
dtype: int64

Detecting and Filtering Outliers¶

data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean 0.055077 0.045030 -0.025765 -0.000986
std 0.993835 0.978408 0.978154 1.031257
min -3.832766 -3.076625 -3.213987 -3.419100
25% -0.583017 -0.613001 -0.719985 -0.683580
50% 0.055401 0.039207 -0.010730 -0.012271
75% 0.707915 0.764889 0.677561 0.673964
max 3.739664 3.000617 2.741589 3.223334
col = data[2]
col[np.abs(col) > 3]
420   -3.213987
485   -3.023981
Name: 2, dtype: float64
data[(np.abs(data) > 3).any(1)]
0 1 2 3
111 -1.237494 0.385405 1.708786 3.223334
186 0.129223 -0.257331 -1.856794 -3.419100
308 -0.324471 3.000617 -1.237963 -0.085539
356 3.192386 0.567717 0.174435 0.320250
420 0.679497 -2.224678 -3.213987 -0.504708
425 -3.832766 -1.893903 0.340340 -0.327178
473 0.363863 -3.076625 -0.344005 -0.413175
485 0.124100 1.183545 -3.023981 0.579325
541 3.190462 -0.028757 -0.047572 2.121152
753 3.739664 0.733766 -0.229820 0.368722
823 1.983805 -0.933073 -1.768958 -3.194717
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()
0 1 2 3
count 1000.000000 1000.000000 1000.000000 1000.000000
mean 0.054787 0.045106 -0.025527 -0.000596
std 0.987260 0.978164 0.977406 1.028689
min -3.000000 -3.000000 -3.000000 -3.000000
25% -0.583017 -0.613001 -0.719985 -0.683580
50% 0.055401 0.039207 -0.010730 -0.012271
75% 0.707915 0.764889 0.677561 0.673964
max 3.000000 3.000000 2.741589 3.000000
data.head()
np.sign(data).head()
0 1 2 3
0 1.0 1.0 -1.0 -1.0
1 1.0 1.0 1.0 -1.0
2 1.0 -1.0 -1.0 -1.0
3 1.0 -1.0 1.0 -1.0
4 -1.0 -1.0 1.0 -1.0

Permutation and Random Sampling¶

df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
df
#sampler = np.random.permutation(5)
#sampler
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
4 16 17 18 19
df
df.take(sampler)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-58-2cab25988b0f> in <module>
      1 df
----> 2 df.take(sampler)

NameError: name 'sampler' is not defined
df.sample(n=3) # selects 3 random rows (without replacement by default)
0 1 2 3
3 12 13 14 15
4 16 17 18 19
2 8 9 10 11
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws
3    6
0    5
0    5
3    6
2   -1
3    6
0    5
0    5
4    4
4    4
dtype: int64

Computing Indicator/Dummy Variables¶

df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df
pd.get_dummies(df['key'])
a b c
0 0 1 0
1 0 1 0
2 1 0 0
3 0 0 1
4 1 0 0
5 0 1 0
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
data1 key_a key_b key_c
0 0 0 1 0
1 1 0 1 0
2 2 1 0 0
3 3 0 0 1
4 4 1 0 0
5 5 0 1 0

String Manipulation¶

String Object Methods¶

val = 'a,b,  guido'
val.split(',')
['a', 'b', '  guido']
pieces = [x.strip() for x in val.split(',')]
pieces
['a', 'b', 'guido']
first, second, third = pieces
third
first + '::' + second + '::' + third
'a::b::guido'
'::'.join(pieces)
'a::b::guido'
'guido' in val
val.index(',')
val.find(':')
-1
#val.index(':')
val.count('a')
1
val
#val.replace(',', '::')
val.replace(',', '')
'ab  guido'

Regular Expressions¶

import re
text = "foo    bar\t baz  \tqux"
re.split(r'\s+', text)  # r'\s+' matches runs of any whitespace (space, tab, newline...)
['foo', 'bar', 'baz', 'qux']
regex = re.compile(r'\s+')
regex.split(text)
['foo', 'bar', 'baz', 'qux']
regex.findall(text)
['    ', '\t ', '  \t']
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}' #r for raw string

# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)
regex.findall(text)
['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']
m = regex.search(text)
text[m.start():m.end()]
'dave@google.com'
print(regex.match(text))
None
print(regex.sub('REDACTED', text))
Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)
m = regex.match('wesm@bright.net')
m.groups()
('wesm', 'bright', 'net')
regex.findall(text)
[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]
print(regex.sub(r'\nUsername: \1, Domain: \2, Suffix: \3', text))
Dave 
Username: dave, Domain: google, Suffix: com
Steve 
Username: steve, Domain: gmail, Suffix: com
Rob 
Username: rob, Domain: gmail, Suffix: com
Ryan 
Username: ryan, Domain: yahoo, Suffix: com

Vectorized String Functions in pandas¶

data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data
data.isnull()
Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool
data.str.contains('gmail')
Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object
pattern
data.str.findall(pattern, flags=re.IGNORECASE)
Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches
Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object
data.str.get(1)
data.str[1]
Dave       a
Steve      t
Rob        o
Wes      NaN
dtype: object
data.str[:5]
Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object