etl.py 4.47 KB
Newer Older
Sanjay Krishnan committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
'''
The etl module defines basic language primitives for manipulating Pandas
DataFrames. It takes a DataFrame in and outputs a transformed DataFrame.
You will implement several of the routines to perform these transformations.

Example Usage:

You can create DataFrame and create an ETL class that takes the DataFrame
as input.

>> df1 = pd.DataFrame([['Bob', 'Stewart'],
                       ['Anna', 'Davis'],
                       ['Jerry', 'Dole'],
                       ['John', 'Marsh']],
                      columns=["first_name", "last_name"])
>> etl = ETL(df1)

The add() function creates a new column with a specified value:

>> etl.add(colname="age", 0)
>> pw1.df
  first_name last_name  age
0        Bob   Stewart    0
1       Anna     Davis    0
2      Jerry      Dole    0
3       John     Marsh    0
'''

import pandas as pd
import re


class ETL:
    '''
    The class that defines ETL transformations for a single dataframe
    '''

    def __init__(self, df: pd.DataFrame):
        '''
        ETL objects are constructed with a source
        dataframe. These dataframes are manipulated
        in-place.
        '''

        #how to access the source dataframe
        self.df = df

        #stores a history of the transformations to the df
        self.transforms = []


    def add(self, colname, x):
        '''
        The add(colname, x) function adds a column with the specified name
        (colname) and a specified value (x). It adds this value to
        all rows of the dataframe. We've implemented this as an
        example to show you how to structure your ETL functions. 

        add *modifies* self.df as well as *returns* it
        '''

        #Test to see if colname is None, if so use a default colname
        self.df[colname] = x

        #append your changes to the transform list
        self.transforms.append(self.df.copy(deep=True))
        return self.df



    def drop(self, colname):
        '''
        The drop(colname) function returns a DataFrame 
        with the column (colname) removed.

        drop *modifies* self.df as well as *returns* it
        '''

        #YOUR CODE HERE

        self.transforms.append(self.df.copy(deep=True))
        return self.df

    def copy(self, colname, new_colname):
        '''
        copy(colname, new_colname) duplicates a column and 
        saves it to the new_colname.

        copy *modifies* self.df as well as *returns* it.
        '''

        #YOUR CODE HERE

        self.transforms.append(self.df.copy(deep=True))
        return self.df


    def split(self, colname, new_colname, splitter):
        '''
        split(colname, new_colname, splitter) takes a column
        splits the value on a delimiter. It replaces colname
        with the substrings that appear before the delimiter
        and puts the values after the delimiter in the 
        new_colname. If the string does not contain the delimiter
        then new_colname is assigned an empty string.

        split *modifies* self.df as well as *returns* it.
        '''

        #YOUR CODE HERE


        self.transforms.append(self.df.copy(deep=True))
        return self.df


    def merge(self, col1, col2, splitter):
        '''
        merge(col1, col2, splitter) replaces col1
        with the values of col1 and col2 concatenated,
        and seperated by the delimiter. The delimiter is
        ignored if either df.col1 or df.col2 is an empty 
        string.

        merge *modifies* self.df as well as *returns* it. 
        '''

        #YOUR CODE HERE


        self.transforms.append(self.df.copy(deep=True))
        return self.df


    def format(self, colname, fn):
        '''
        format applies an input function to every value in colname. fn
        is a *function*.

        format *modifies* self.df as well as *returns* it.
        '''
        
        #YOUR CODE HERE


        self.transforms.append(self.df.copy(deep=True))
        return self.df


    def divide(self, colname, new_colname1, new_colname2, condition):
        '''
        Divide conditionally divides a column, sending values that 
        satisfy the condition into one of two columns 
        (new_colname1 or new_colname2). condition is a Boolean function
        of values.

        See examples in the writeup.
        '''

        #YOUR CODE HERE

        self.transforms.append(self.df.copy(deep=True))
        return self.df