List of Review topics

  • functions

  • Using assert

  • dictionary comprehension

  • method chaining

  • assign and lambda for dataframes

  • coming up with EDA and questions

Task: Create a function named remove_middle which has three parameters named lst, start, and end. The function should return a list where all elements in lst with an index between start and end (inclusive) have been removed.

For example, the following code:

lst = [4, 8, 15, 16, 23, 42]
remove_middle(lst, start=1, end=3)

should return [4, 23, 42] because elements at indices 1, 2, and 3 have been removed.

Make sure that you test your function with at least two different inputs (test1 and test2), and use assert statements to confirm your function works correctly.

def remove_middle(lst, start, end):
    """Return a copy of ``lst`` without the elements at indices start..end.

    Both ``start`` and ``end`` are inclusive, so
    ``remove_middle([4, 8, 15, 16, 23, 42], 1, 3)`` returns ``[4, 23, 42]``.
    The input list itself is not modified.

    Args:
        lst: The list to filter; must be a ``list``.
        start: Index of the first element to remove.
        end: Index of the last element to remove.

    Returns:
        A new list holding every element whose index is below ``start``
        or above ``end``.

    Raises:
        AssertionError: If ``lst`` is not a list.
    """
    # Check to make sure the input is actually a list
    assert isinstance(lst, list), "This is not the expected form, please enter a list!"

    # Keep only the elements strictly outside [start, end]. The comparison
    # must be `<` (not `<=`) so that the element at index `start` is removed.
    return [item for index, item in enumerate(lst) if index < start or index > end]
sample_list = [4, 8, 15, 16, 23, 42]

remove_middle(sample_list, 1, 3)
[4, 8, 23, 42]
sample_list
[4, 8, 15, 16, 23, 42]
type(sample_list)
list
# This will raise an AssertionError because a tuple, not a list, is passed in
remove_middle((5, 4, 6, 7, 10), 1, 3)
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Cell In[5], line 2
      1 # This will return an error because of the assert statement
----> 2 remove_middle((5, 4, 6, 7, 10), 1, 3)

Cell In[1], line 6, in remove_middle(lst, start, end)
      1 def remove_middle(lst, start, end):
      2 
      3     # Check to make sure the input is actually a list
      4     # for now, pretend it is a list.
----> 6     assert type(lst) == list, "This is not the expected form, please enter a list!"
      8     # Remove all elements between start and end
      9     new_list = []

AssertionError: This is not the expected form, please enter a list!
# Assert statement to check output

remove_middle(sample_list, 1, 3)

assert remove_middle(sample_list, 1, 3) == [
    4,
    23,
    42,
], "There is something wrong with your function, please fix it!"
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Input In [54], in <cell line: 5>()
      1 # Assert statement to check output
      3 remove_middle(sample_list, 1, 3)
----> 5 assert remove_middle(sample_list, 1, 3) == [4,23,42], "There is something wrong with your function, please fix it!"

AssertionError: There is something wrong with your function, please fix it!
remove_middle(sample_list, 1, 3)
[4, 8, 23, 42]

Another function:

  • Take in a list, return only numbers between 5 and 20 (not inclusive)

  • Call the function exclude_numbers()

  • return a list

sample_list
[4, 8, 15, 16, 23, 42]
exclude_numbers(sample_list)
The number 8 were between 5 and 20 and were excluded
The number 15 were between 5 and 20 and were excluded
The number 16 were between 5 and 20 and were excluded
[4, 23, 42]
def exclude_numbers(lst):
    """Return the elements of ``lst`` that are below 5 or above 20.

    Every other element (anything from 5 to 20, both bounds included) is
    left out of the result and reported with a printed message.

    Args:
        lst: A list of numbers.

    Returns:
        A new list with the kept elements, in their original order.
    """
    kept = []

    for value in lst:
        if value < 5 or value > 20:
            kept.append(value)
            continue
        # Anything that reaches this point lies in the excluded range.
        print(f"The number {value} were between 5 and 20 and were excluded")

    return kept


assert exclude_numbers(sample_list) == [
    4,
    23,
    42,
], "There is something wrong with your function!"
The number 8 were between 5 and 20 and were excluded
The number 15 were between 5 and 20 and were excluded
The number 16 were between 5 and 20 and were excluded
exclude_numbers(sample_list)
The number 8 were between 5 and 20 and were excluded
The number 15 were between 5 and 20 and were excluded
The number 16 were between 5 and 20 and were excluded
[4, 23, 42]
def exclude_numbers2(lst):
    """Return only the elements of ``lst`` strictly between 5 and 20.

    Elements outside that open interval are left out of the result and
    reported with a printed message.

    Args:
        lst: A list of numbers.

    Returns:
        A new list with the elements ``x`` satisfying ``5 < x < 20``, in
        their original order.
    """
    kept = []

    for value in lst:
        # chained comparison, see this SO answer: https://stackoverflow.com/a/13628825
        if 5 < value < 20:
            kept.append(value)
        else:
            # This branch handles values OUTSIDE the range, so the message
            # must say "not between".
            print(f"The number {value} was not between 5 and 20 and was excluded")

    return kept


assert exclude_numbers2(sample_list) == [
    8,
    15,
    16,
], "There is something wrong with your function!"
The number 4 were between 5 and 20 and were excluded
The number 23 were between 5 and 20 and were excluded
The number 42 were between 5 and 20 and were excluded
# Attempt 3: using list comprehension


def exclude_numbers3(lst):
    """Return the elements of ``lst`` strictly between 5 and 20.

    Args:
        lst: An iterable of numbers.

    Returns:
        A new list with the elements ``x`` satisfying ``5 < x < 20``, in
        their original order.
    """
    lower, upper = 5, 20
    return [number for number in lst if lower < number < upper]
exclude_numbers3(sample_list)
[8, 15, 16]

Dictionary Comprehensions

sample = {
    "key1": 5,
    "key2": 10,
    "key3": 15,
    "key4": 20,
    "key5": 10,
    "key6": 5,
    "key7": 0,
}
{jack: blue * 100 for (jack, blue) in sample.items() if blue > 10}
{'key3': 1500, 'key4': 2000}
# .items() iterates over the dictionary's (key, value) pairs

for k, v in sample.items():
    print(k, v)
key1 5
key2 10
key3 15
key4 20
key5 10
key6 5
key7 0

Method Chaining

import seaborn as sns
df = sns.load_dataset("planets")

df.head()
method number orbital_period mass distance year
0 Radial Velocity 1 269.300 7.10 77.40 2006
1 Radial Velocity 1 874.774 2.21 56.95 2008
2 Radial Velocity 1 763.000 2.60 19.84 2011
3 Radial Velocity 1 326.030 19.40 110.62 2007
4 Radial Velocity 1 516.220 10.50 119.47 2009
sorted(df["year"].unique())
[1989,
 1992,
 1994,
 1995,
 1996,
 1997,
 1998,
 1999,
 2000,
 2001,
 2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014]
# only look at data from after 2000 (the year 2000 itself is excluded):
df[df["year"] > 2000]

# to verify:
# df[df['year']>2000]['year'].unique()
method number orbital_period mass distance year
0 Radial Velocity 1 269.300000 7.10 77.40 2006
1 Radial Velocity 1 874.774000 2.21 56.95 2008
2 Radial Velocity 1 763.000000 2.60 19.84 2011
3 Radial Velocity 1 326.030000 19.40 110.62 2007
4 Radial Velocity 1 516.220000 10.50 119.47 2009
... ... ... ... ... ... ...
1030 Transit 1 3.941507 NaN 172.00 2006
1031 Transit 1 2.615864 NaN 148.00 2007
1032 Transit 1 3.191524 NaN 174.00 2007
1033 Transit 1 4.125083 NaN 293.00 2008
1034 Transit 1 4.187757 NaN 260.00 2008

987 rows × 6 columns

df["orbital_period"].describe()
count       992.000000
mean       2002.917596
std       26014.728304
min           0.090706
25%           5.442540
50%          39.979500
75%         526.005000
max      730000.000000
Name: orbital_period, dtype: float64
# Task: exclude all the orbital periods over 40

df[df["orbital_period"] <= 40]

# to verify: df[df['orbital_period']<=40]['orbital_period'].max()
method number orbital_period mass distance year
16 Radial Velocity 1 4.230785 0.472 15.36 1995
17 Radial Velocity 5 14.651000 0.800 12.53 1996
20 Radial Velocity 5 0.736540 NaN 12.53 2011
22 Radial Velocity 3 4.215000 0.016 8.52 2009
23 Radial Velocity 3 38.021000 0.057 8.52 2009
... ... ... ... ... ... ...
1030 Transit 1 3.941507 NaN 172.00 2006
1031 Transit 1 2.615864 NaN 148.00 2007
1032 Transit 1 3.191524 NaN 174.00 2007
1033 Transit 1 4.125083 NaN 293.00 2008
1034 Transit 1 4.187757 NaN 260.00 2008

496 rows × 6 columns

# Task: rename column to something else

df.columns
Index(['method', 'number', 'orbital_period', 'mass', 'distance', 'year'], dtype='object')
# turns everything into upper case
[c.upper() for c in df.columns]
['METHOD', 'NUMBER', 'ORBITAL_PERIOD', 'MASS', 'DISTANCE', 'YEAR']
# dictionary comprehension
{c: c.upper() for c in df.columns}
{'method': 'METHOD',
 'number': 'NUMBER',
 'orbital_period': 'ORBITAL_PERIOD',
 'mass': 'MASS',
 'distance': 'DISTANCE',
 'year': 'YEAR'}
df.rename(columns={c: c.upper() for c in df.columns})
METHOD NUMBER ORBITAL_PERIOD MASS DISTANCE YEAR
0 Radial Velocity 1 269.300000 7.10 77.40 2006
1 Radial Velocity 1 874.774000 2.21 56.95 2008
2 Radial Velocity 1 763.000000 2.60 19.84 2011
3 Radial Velocity 1 326.030000 19.40 110.62 2007
4 Radial Velocity 1 516.220000 10.50 119.47 2009
... ... ... ... ... ... ...
1030 Transit 1 3.941507 NaN 172.00 2006
1031 Transit 1 2.615864 NaN 148.00 2007
1032 Transit 1 3.191524 NaN 174.00 2007
1033 Transit 1 4.125083 NaN 293.00 2008
1034 Transit 1 4.187757 NaN 260.00 2008

1035 rows × 6 columns

import numpy as np
a = np.random.randint(50, size=50)
a
array([47, 25, 12, 47, 25,  5,  5, 39, 33,  2, 46, 49,  5,  2, 20, 44, 37,
       15, 37, 45, 41, 22, 32, 38, 16, 47, 49, 49, 14, 17, 15, 44, 48, 34,
       46, 37, 13, 32, 32, 44, 20, 20, 10, 45, 16, 47,  9, 25, 42,  6])
np.where(a % 2 == 0, "Even", "Odd")
array(['Odd', 'Odd', 'Even', 'Odd', 'Odd', 'Odd', 'Odd', 'Odd', 'Odd',
       'Even', 'Even', 'Odd', 'Odd', 'Even', 'Even', 'Even', 'Odd', 'Odd',
       'Odd', 'Odd', 'Odd', 'Even', 'Even', 'Even', 'Even', 'Odd', 'Odd',
       'Odd', 'Even', 'Odd', 'Odd', 'Even', 'Even', 'Even', 'Even', 'Odd',
       'Odd', 'Even', 'Even', 'Even', 'Even', 'Even', 'Even', 'Odd',
       'Even', 'Odd', 'Odd', 'Odd', 'Even', 'Even'], dtype='<U4')
df = sns.load_dataset("planets")


# Build the filtered frame as a single method chain; each step operates on
# the result of the previous one.
df = (
    # keep rows whose year is strictly greater than 2000
    df.loc[lambda x: x["year"] > 2000]
    # keep rows whose orbital_period is strictly below 40
    # NOTE(review): the earlier standalone filter used `<= 40` — confirm which bound is intended
    .loc[lambda x: x["orbital_period"] < 40]
    # label each row "big" when distance > 12, otherwise "small"
    .assign(test_column=lambda x: np.where(x["distance"] > 12, "big", "small"))
    # drop the "number" column
    .drop(["number"], axis=1)
    # NOTE: `df` here is still the frame loaded before the chain, so only its
    # original column names are upper-cased; `test_column` keeps its lower-case name
    .rename(columns={c: c.upper() for c in df.columns})
)

df.head()
METHOD ORBITAL_PERIOD MASS DISTANCE YEAR test_column
20 Radial Velocity 0.73654 NaN 12.53 2011 big
22 Radial Velocity 4.21500 0.0160 8.52 2009 small
23 Radial Velocity 38.02100 0.0570 8.52 2009 small
46 Radial Velocity 3.23570 0.0036 1.35 2012 small
79 Radial Velocity 5.60000 0.0450 42.09 2009 big