''' The etl module defines basic language primitives for manipulating Pandas DataFrames. It takes a DataFrame in and outputs a transformed DataFrame. You will implement several of the routines to perform these transformations. Example Usage: You can create DataFrame and create an ETL class that takes the DataFrame as input. >> df1 = pd.DataFrame([['Bob', 'Stewart'], ['Anna', 'Davis'], ['Jerry', 'Dole'], ['John', 'Marsh']], columns=["first_name", "last_name"]) >> etl = ETL(df1) The add() function creates a new column with a specified value: >> etl.add(colname="age", 0) >> pw1.df first_name last_name age 0 Bob Stewart 0 1 Anna Davis 0 2 Jerry Dole 0 3 John Marsh 0 ''' import pandas as pd import re class ETL: ''' The class that defines ETL transformations for a single dataframe ''' def __init__(self, df: pd.DataFrame): ''' ETL objects are constructed with a source dataframe. These dataframes are manipulated in-place. ''' #how to access the source dataframe self.df = df #stores a history of the transformations to the df self.transforms = [] def add(self, colname, x): ''' The add(colname, x) function adds a column with the specified name (colname) and a specified value (x). It adds this value to all rows of the dataframe. We've implemented this as an example to show you how to structure your ETL functions. add *modifies* self.df as well as *returns* it ''' #Test to see if colname is None, if so use a default colname self.df[colname] = x #append your changes to the transform list self.transforms.append(self.df.copy(deep=True)) return self.df def drop(self, colname): ''' The drop(colname) function returns a DataFrame with the column (colname) removed. drop *modifies* self.df as well as *returns* it ''' #YOUR CODE HERE self.transforms.append(self.df.copy(deep=True)) return self.df def copy(self, colname, new_colname): ''' copy(colname, new_colname) duplicates a column and saves it to the new_colname. copy *modifies* self.df as well as *returns* it. ''' #YOUR CODE HERE self.transforms.append(self.df.copy(deep=True)) return self.df def split(self, colname, new_colname, splitter): ''' split(colname, new_colname, splitter) takes a column splits the value on a delimiter. It replaces colname with the substrings that appear before the delimiter and puts the values after the delimiter in the new_colname. If the string does not contain the delimiter then new_colname is assigned an empty string. split *modifies* self.df as well as *returns* it. ''' #YOUR CODE HERE self.transforms.append(self.df.copy(deep=True)) return self.df def merge(self, col1, col2, splitter): ''' merge(col1, col2, splitter) replaces col1 with the values of col1 and col2 concatenated, and seperated by the delimiter. The delimiter is ignored if either df.col1 or df.col2 is an empty string. merge *modifies* self.df as well as *returns* it. ''' #YOUR CODE HERE self.transforms.append(self.df.copy(deep=True)) return self.df def format(self, colname, fn): ''' format applies an input function to every value in colname. fn is a *function*. format *modifies* self.df as well as *returns* it. ''' #YOUR CODE HERE self.transforms.append(self.df.copy(deep=True)) return self.df def divide(self, colname, new_colname1, new_colname2, condition): ''' Divide conditionally divides a column, sending values that satisfy the condition into one of two columns (new_colname1 or new_colname2). condition is a Boolean function of values. See examples in the writeup. ''' #YOUR CODE HERE self.transforms.append(self.df.copy(deep=True)) return self.df