Commit 111ea51a by Sanjay Krishnan

added the assignments

parent 110f79f0
'''
The etl module defines basic language primitives for manipulating Pandas
DataFrames. It takes a DataFrame in and outputs a transformed DataFrame.
You will implement several of the routines to perform these transformations.

Example Usage:
You can create a DataFrame and construct an ETL object that takes the
DataFrame as input.
>> df1 = pd.DataFrame([['Bob', 'Stewart'],
                       ['Anna', 'Davis'],
                       ['Jerry', 'Dole'],
                       ['John', 'Marsh']],
                      columns=["first_name", "last_name"])
>> etl = ETL(df1)

The add() function creates a new column with a specified value:
>> etl.add("age", 0)
>> etl.df
  first_name last_name  age
0        Bob   Stewart    0
1       Anna     Davis    0
2      Jerry      Dole    0
3       John     Marsh    0
'''
import pandas as pd
import re
class ETL:
    '''
    The class that defines ETL transformations for a single dataframe
    '''
    def __init__(self, df: pd.DataFrame):
        '''
        ETL objects are constructed with a source
        dataframe. These dataframes are manipulated
        in-place.
        '''
        #how to access the source dataframe
        self.df = df

        #stores a history of the transformations to the df
        self.transforms = []

    def add(self, colname, x):
        '''
        The add(colname, x) function adds a column with the specified name
        (colname) and a specified value (x). It adds this value to
        all rows of the dataframe. We've implemented this as an
        example to show you how to structure your ETL functions.
        add *modifies* self.df as well as *returns* it
        '''
        #assign the constant value x to every row of the new column
        self.df[colname] = x

        #append your changes to the transform list
        self.transforms.append(self.df.copy(deep=True))
        return self.df
    def drop(self, colname):
        '''
        The drop(colname) function returns a DataFrame
        with the column (colname) removed.
        drop *modifies* self.df as well as *returns* it
        '''
        #YOUR CODE HERE

        self.transforms.append(self.df.copy(deep=True))
        return self.df

    def copy(self, colname, new_colname):
        '''
        copy(colname, new_colname) duplicates a column and
        saves it to the new_colname.
        copy *modifies* self.df as well as *returns* it.
        '''
        #YOUR CODE HERE

        self.transforms.append(self.df.copy(deep=True))
        return self.df

    def split(self, colname, new_colname, splitter):
        '''
        split(colname, new_colname, splitter) takes a column and
        splits its values on a delimiter. It replaces colname
        with the substrings that appear before the delimiter
        and puts the values after the delimiter in the
        new_colname. If the string does not contain the delimiter
        then new_colname is assigned an empty string.
        split *modifies* self.df as well as *returns* it.
        '''
        #YOUR CODE HERE

        self.transforms.append(self.df.copy(deep=True))
        return self.df

    def merge(self, col1, col2, splitter):
        '''
        merge(col1, col2, splitter) replaces col1
        with the values of col1 and col2 concatenated,
        and separated by the delimiter. The delimiter is
        ignored if either df.col1 or df.col2 is an empty
        string.
        merge *modifies* self.df as well as *returns* it.
        '''
        #YOUR CODE HERE

        self.transforms.append(self.df.copy(deep=True))
        return self.df

    def format(self, colname, fn):
        '''
        format applies an input function to every value in colname. fn
        is a *function*.
        format *modifies* self.df as well as *returns* it.
        '''
        #YOUR CODE HERE

        self.transforms.append(self.df.copy(deep=True))
        return self.df

    def divide(self, colname, new_colname1, new_colname2, condition):
        '''
        Divide conditionally divides a column, sending values that
        satisfy the condition into one of two columns
        (new_colname1 or new_colname2). condition is a Boolean function
        of values.
        See examples in the writeup.
        '''
        #YOUR CODE HERE

        self.transforms.append(self.df.copy(deep=True))
        return self.df
'''
Here, you will write programs that transform dataframes
using the functions that you wrote.
'''
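#For example, once split() works, a program could break "Such,Bob" style
#names apart like this (an illustrative sketch, not one of the required
#programs; the column names here are made up):
#
#   etl = ETL(pd.DataFrame([['Such,Bob']], columns=["name"]))
#   etl.split("name", "rest", ",")   #name -> "Such", rest -> "Bob"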
def phone():
    '''
    Write an ETL program that results in a
    dataframe with two columns: area_code, phone_number.
    '''
    df = pd.DataFrame([['(408)996-758'],
                       ['+1 667 798 0304'],
                       ['(774)998-758'],
                       ['+1 442 030 9595']],
                      columns=["phoneno"])
    etl = ETL(df)

    #Your code goes here

    return etl.df
def date():
    '''
    Write an ETL program that results in a
    dataframe with three columns: day, month, year.
    The day must be in two-digit format, e.g., 01, 08.
    The month must be the full month name, e.g., "May".
    The year must be in YYYY format.
    '''
    df = pd.DataFrame([['03/2/1990'],
                       ['2/14/1964'],
                       ['1990-04-30'],
                       ['7/9/2012'],
                       ['1989-09-13'],
                       ['1994-08-21'],
                       ['1996-11-30'],
                       ['2004-12-23'],
                       ['4/21/2016']],
                      columns=["date"])
    etl = ETL(df)

    #Your code goes here

    return etl.df
def name():
    '''
    Write an ETL program that correctly formats names
    into first_name and last_name.
    '''
    df = pd.DataFrame([['Such,Bob', ''],
                       ['Ann', 'Davis'],
                       ['Dole,Jerry', ''],
                       ['Joan', 'Song']],
                      columns=["first_name", "last_name"])
    etl = ETL(df)

    #Your code goes here

    return etl.df
# Out-of-Core Group By Aggregate
*Due 6/1/19 11:59 PM*
In this assignment, you will implement an out-of-core
version of the group by aggregate (aggregation by key)
seen in lecture. You will have a set memory limit and
you will have to count the number of times a string shows
up in an iterator. Your program should work for any limit greater than 20.
## Getting Started
First, pull the most recent changes from the cmsc13600-public repository:
```
$ git pull
```
Then, copy the `hw5` folder to your submission repository and change directories into it. Your code will go into `countD.py`; this is the only file that you will modify. Finally, add `countD.py` using `git add`:
```
$ git add countD.py
$ git commit -m'initialized homework'
```
Now, you will need to fetch the data used in this assignment. Download title.csv and put it in the `hw5` folder:
https://www.dropbox.com/s/zl7yt8cl0lvajxg/title.csv?dl=0
DO NOT ADD title.csv to the git repo! After downloading the
dataset, you can use the Python module provided for you called `core.py`, which reads the dataset and exposes it
as iterators through two functions, `imdb_years()` and `imdb_title_words()`:
```
>> for i in imdb_years():
... print(i)
1992
1986
<so on>
```
Play around with both `imdb_years()` and `imdb_title_words()` to get a feel for how the data works.
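For instance, a quick (purely illustrative) way to peek at the first few title words uses `islice` from the standard library:
```
from itertools import islice
from core import *

#print the first five words yielded by the iterator
for w in islice(imdb_title_words(), 5):
    print(w)
```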
## MemoryLimitedHashMap
In this project, the main data structure is the `MemoryLimitedHashMap`. This is a hash map that has an explicit limit on the number of keys it can store. To create one of these data structures, import it from the core module:
```
from core import *
#create a memory limited hash map
m = MemoryLimitedHashMap()
```
To find out what the limit of this hash map is, you can:
```
print("The max size of m is: ", m.limit)
```
The data structure can be constructed with an explicit limit (the default is 1000), e.g., `MemoryLimitedHashMap(limit=10)`.
Adding data to this hash map works like the hash maps you have probably seen in a data structures class: there is a `put` function that takes in a key and assigns that key a value:
```
# put some keys
m.put('a', 1)
m.put('b', 45)
print("The size of m is: ", m.size())
```
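Note that calling `put` on a key that is already present simply overwrites its value, so updating an existing key works even when the map is full (you can confirm this in `put` inside `ooc.py`). For example:
```
#overwrite an existing key; this does not consume a new slot
m.put('a', m.get('a') + 1)
print("The value at key= a is now", m.get('a'))
```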
You can fetch the data using the `get` function and `keys` function:
```
# get keys
for k in m.keys():
    print("The value at key=", k, 'is', m.get(k))
# You can test to see if a key exists
print('Does m contain a?', m.contains('a'))
print('Does m contain c?', m.contains('c'))
```
When a key does not exist in the data structure, the `get` function will raise an error:
```
#This gives an error:
m.get('c')
```
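One way to avoid the error is to guard `get` with `contains`, giving a get-or-default pattern (a small sketch):
```
#read a count, defaulting to 0 if the key was never put
count = m.get('c') if m.contains('c') else 0
print('The count for c is', count)
```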
Similarly, if you assign too many unique keys (more than the limit) you will get an error:
```
for i in range(0,1001):
    m.put(str(i), i)
```
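In your own code you will therefore want to check `size()` against `limit` before inserting a brand-new key, and make room when the map is full. A minimal sketch, assuming `w` is the next string from your input and picking an arbitrary victim to spill to disk (the `flushKey` function is described next):
```
if not m.contains(w) and m.size() == m.limit:
    #the map is full: spill some resident key to disk to make room
    victim = next(iter(m.keys()))
    m.flushKey(victim)
m.put(w, 1)
```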
The `MemoryLimitedHashMap` allows you to manage this limited storage with a `flush` function that persists a key and its value to disk. When you flush a key, it is removed from the in-memory data structure, freeing up one of the limited slots. Flush takes a key as a parameter.
```
m.flushKey('a')
print("The size of m is: ", m.size())
```
Note that the disk is not intelligent! If you flush a key multiple times it simply appends the flushed value to a file on disk:
```
m.flushKey('a')
<some work...>
m.flushKey('a')
```
Once a key has been flushed it can be read back using the `load` function (which takes a key as a parameter). This loads back *all* of the flushed values:
```
#You can also load values from disk
for k,v in m.load('a'):
    print(k,v)
```
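Since each flush appends one value to the key's file, a key's total can be recovered by summing everything `load` streams back, for example:
```
#combine the partial values that were flushed for 'a'
total = sum(v for _, v in m.load('a'))
print('Total for a:', total)
```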
If you try to load a key that has not been flushed, you will get an error:
```
#Error!!
for k,v in m.load('d'):
    print(k,v)
```
If you want multiple flushes of the same key to be differentiated, you can set a *subkey*:
```
#first flush
m.flushKey('a', '0')
<some work...>
#second flush
m.flushKey('a', '1')
```
The `load` function allows you to selectively pull
certain subkeys:
```
# pull only the first flush
m.load('a', '0')
```
We can similarly iterate over all of the data, both in memory and flushed to disk, with `loadAll` (which optionally takes a subkey as well!):
```
for k,v in m.loadAll():
    print(k,v)
```
There is also a way to iterate over all of the flushed keys (subkeys are stripped out):
```
m.fKeys()
```
## Count Per Group
In this assignment, you will implement an out-of-core count operator which, for each distinct string in an iterator, returns
the number of times it appears (in no particular order).
For example,
```
In: "the", "cow", "jumped", "over", "the", "moon"
Out: ("the",2), ("cow",1), ("jumped",1), ("over",1), ("moon",1)
```
Or,
```
In: "a", "b", "b", "a", "c"
Out: ("c",1),("b",2), ("a", 2)
```
The catch is that you CANNOT use a list, dictionary, or set from
Python. We provide a general purpose data structure called a MemoryLimitedHashMap (see ooc.py). You must maintain the iterator
state using this data structure.
The class that you will implement is called Count (in countD.py).
The constructor is written for you, and it takes in an input iterator and a MemoryLimitedHashMap. You will use these objects
in your implementation. You will have to implement `__next__` and `__iter__`. Any solution using a list, dictionary, or set inside `Count` will receive 0 points.
The hint is to do this in multiple passes, using a subkey to track the keys flushed during different passes; one possible shape of a counting pass is sketched below.
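To make the hint concrete, here is one possible shape for a single counting pass, written as a free-standing helper rather than the `Count` iterator you must implement. This is only a sketch under assumptions: the name `count_pass`, the arbitrary victim choice, and the merging of flushed partial counts across passes are all left to you.
```
def count_pass(stream, m, pass_id):
    #tally what fits in memory; spill overflow to disk under this
    #pass's subkey so later passes can tell the flushes apart
    for w in stream:
        if m.contains(w):
            m.put(w, m.get(w) + 1)
        else:
            if m.size() == m.limit:
                #full: flush an arbitrary resident key's partial count
                victim = next(iter(m.keys()))
                m.flushKey(victim, pass_id)
            m.put(w, 1)
    #at this point, in-memory counts are exact for keys that were
    #never flushed; flushed keys still need their partials summed
```
Keys that were flushed can later be reconciled by summing the partial counts streamed back by `load`/`loadAll` together with whatever remains in memory.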
## Testing and Submission
We have provided a series of basic tests in `auto_grader.py`; these tests are incomplete and are not meant to comprehensively grade your assignment. There is a file `years.json` with the expected output. After you finish the assignment, you can submit your code with:
```
$ git push
```
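Before pushing, you can run the provided tests locally. Assuming a standard Python 3 installation, something like:
```
$ python3 auto_grader.py
```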
from countD import *
from core import *
from ooc import *
import json
test_file = open('years.json','r')
expected = json.loads(test_file.read())
for l in range(80, 140, 20):
    m = MemoryLimitedHashMap(limit = l)
    actual = {k:v for k,v in Count(imdb_years(), m)}
    print("Memory Limit", l, expected == actual)
'''
The core module sets up the data structures
and references for this programming assignment.
'''
import platform
import os
import json
if platform.system() == 'Windows':
    print("This assignment will not work on a Windows computer")
    exit()
def imdb_title_words():
    f = open('title.csv','r')
    line = f.readline()
    while line != "":
        words = line.strip().split(',')[1].split()
        for w in words:
            yield w
        line = f.readline()
    f.close()
def imdb_years():
    f = open('title.csv','r')
    line = f.readline()
    while line != "":
        csvsplit = line.strip().split(',')
        year = csvsplit[len(csvsplit) - 8]
        if year.strip() != "":
            yield year
        line = f.readline()
    f.close()
"""
Get the dataset first, download title.csv put it in the pa1 folder
https://www.dropbox.com/s/zl7yt8cl0lvajxg/title.csv?dl=0
Count the number of times each symbol shows up in an iterator
with limited memory. Your program should work for any limit
> 20.
"""
class Count:
    """
    In this assignment, you will implement an out-of-core count
    operator which, for each distinct string in an iterator, returns
    the number of times it appears (in no particular order).
    For example,
    In: "the", "cow", "jumped", "over", "the", "moon"
    Out: ("the",2), ("cow",1), ("jumped",1), ("over",1), ("moon",1)
    Or,
    In: "a", "b", "b", "a", "c"
    Out: ("c",1),("b",2), ("a", 2)
    The catch is that you CANNOT use a list, dictionary, or set from
    Python. We provide a general-purpose data structure called a
    MemoryLimitedHashMap (see ooc.py). You must maintain the iterator
    state using this data structure.
    """
    def __init__(self, input, memory_limit_hashmap):
        '''
        The constructor takes in an input iterator and
        a MemoryLimitedHashMap. You will use these objects
        in your implementation.
        '''
        self.in1 = input
        self.hashmap = memory_limit_hashmap

    def __iter__(self):
        raise NotImplementedError("You must implement __iter__")

    def __next__(self):
        raise NotImplementedError("You must implement __next__")
import os
import json

class MemoryLimitedHashMap(object):
    '''
    A MemoryLimitedHashMap simulates a hardware memory limit for a
    key-value data structure. It will raise an exception if the
    limit is exceeded.
    Keys must be strings.
    '''
    def __init__(self, diskfile='disk.file', limit=1000):
        '''
        The constructor takes a reference to a persistent file
        and a memory limit.
        '''
        if os.path.exists(diskfile):
            print("[Warning] Overwriting the Disk File", diskfile)
            import shutil
            shutil.rmtree(diskfile)
        os.mkdir(diskfile)
        self.diskfile = diskfile
        self._data = {}
        self.limit = limit

    def size(self):
        return len(self._data)

    def put(self, k, v):
        '''
        Basically works like dict put
        '''
        if not self.contains(k) and len(self._data) == self.limit:
            raise ValueError("[Error] Attempting to Insert Into a Full Map: " + str((k, v)))
        else:
            self._data[k] = v

    def get(self, k):
        '''
        Basically works like dict get
        '''
        return self._data[k]

    def contains(self, k):
        '''
        Basically works like hash map contains
        '''
        return (k in self._data)

    def keys(self):
        '''
        Returns the set of keys currently held in memory
        '''
        return set([k for k in self._data])

    def fKeys(self):
        '''
        Returns the set of keys that have been flushed to disk
        (subkeys are stripped out)
        '''
        return set([self.path2Key(k) for k in os.listdir(self.diskfile)])

    def keyPath(self, k, subkey):
        return self.diskfile + "/" + str(k) + "_" + subkey

    def path2Key(self, k):
        key = k.split("_")[0]
        return key

    def flushKey(self, k, subkey=""):
        '''
        Removes the key from the dictionary and
        persists it to disk.
        '''
        if not self.contains(k):
            raise ValueError("[Error] Map Does Not Contain " + k)
        f = open(self.keyPath(k, subkey), 'a')
        f.write(json.dumps(self.get(k)) + "\n")
        f.close()
        del self._data[k]  #free up the space

    def load(self, k, subkey=""):
        '''
        Streams all of the data from a persisted key
        '''
        fname = self.keyPath(k, subkey)
        if not os.path.exists(fname):
            raise ValueError("[Error] Disk Does Not Contain " + k)
        f = open(fname, 'r')
        line = f.readline()
        while line != "":
            yield (k, json.loads(line.strip()))
            line = f.readline()

    def loadAll(self, subkey=""):
        '''
        Streams all of the data, both in memory and flushed to disk
        '''
        for k in self.keys():
            yield (k, self.get(k))
        for k in self.fKeys():
            for _, v in self.load(k, subkey):
                yield (k, v)
{"1944": 1831, "1917": 4514, "1907": 1487, "1975": 14054, "1880": 1, "2003": 67777, "2012": 164307, "1958": 9768, "1894": 93, "1904": 1136, "1979": 14926, "1915": 7670, "2010": 141703, "2017": 3, "1982": 14770, "1954": 7199, "2014": 3077, "1948": 2840, "1893": 2, "1923": 2614, "2005": 95005, "2009": 128696, "1992": 24917, "1949": 3847, "1945": 1730, "1896": 791, "1902": 1799, "1976": 13994, "1965": 13063, "1961": 11073, "1951": 5663, "1962": 10308, "1929": 2800, "2000": 53013, "1910": 4597, "1912": 7770, "1898": 1740, "1901": 1747, "1922": 3066, "1957": 9491, "1927": 2915, "2001": 58590, "1940": 2202, "2002": 62568, "1995": 36437, "1913": 8902, "1999": 50564, "1994": 30027, "1932": 2567, "1980": 14779, "1925": 2786, "1983": 15489, "1889": 2, "1973": 14284, "1936": 2798, "1892": 9, "1939": 2483, "1971": 14442, "1981": 14456, "2013": 63827, "1942": 2181, "1968": 14235, "1930": 2543, "1990": 23040, "2015": 401, "1914": 8125, "1909": 3417, "1906": 1104, "1920": 4012, "1972": 13623, "1891": 7, "1938": 2730, "1985": 18391, "1911": 5945, "1947": 2291, "1921": 3627, "1888": 5, "1963": 11153, "2016": 32, "1895": 120, "1964": 11416, "1977": 14038, "1984": 16571, "2008": 122861, "1997": 38955, "1955": 8007, "1900": 1816, "1989": 21312, "1966": 13711, "1967": 14601, "1950": 4763, "1918": 3781, "1890": 6, "1931": 2507, "1986": 19440, "1905": 800, "1919": 3613, "1960": 11121, "1899": 1787, "1943": 1960, "1969": 14349, "1974": 13736, "1988": 19861, "2007": 119565, "1935": 2467, "1959": 10517, "1903": 2618, "1926": 2847, "1916": 5835, "1978": 14428, "1937": 2795, "1993": 26775, "1908": 2712, "2004": 84593, "1996": 36509, "1924": 2615, "2006": 108429, "1953": 6834, "1952": 6346, "2019": 2, "1941": 2206, "1970": 15000, "1987": 20122, "2011": 160017, "1934": 2493, "1991": 23799, "1946": 1965, "1998": 46583, "1933": 2433, "1897": 1309, "1956": 8628, "1928": 2773}