pushing latest version

454363b4 · Ethan Mertz · be5fa434 · 454363b4 · 454363b4 · 454363b4
Commit 454363b4 authored Jun 03, 2018 by Ethan Mertz
Showing with 47 additions and 16 deletions
Python/pyfilesplit.py
Python/toptext.py
Python/topvariables.py
README.txt
find_top_python_variables.py
runsim.sh
util.py
--- a/Python/pyfilesplit.py
+++ b/Python/pyfilesplit.py
@@ -43,8 +43,8 @@ def go(subset):
 			current_file.append(line)
 		#Writes out the last file to the directory.
 		with open('pydata/pyfile' + str(count) +".txt", 'w') as new:
-					for l in current_file:
-						new.write(l)
+			for l in current_file:
+				new.write(l)

 if __name__ == "__main__":
 	subset = int(sys.argv[1])

--- a/Python/toptext.py
+++ b/Python/toptext.py
@@ -23,7 +23,10 @@ class TopText(MRJob):
        	filename = "pyfile" + file_number + ".txt"
        	file_text = filename.read()
        	for i in range(int(file1) + 1, int(total)):
-                #Get score for body text.
+                ++--
+ Python/pyfuncsplit.py                             |  24 +++++-
+ Python/topfunctions.py                            |  22 ++++--
+ Python/toptext.py                                 |  30 ++++--#Get score for body text.
        		comparison_file = "pyfile" + i + ".txt"
        		comparison_text = comparison_file.read()
        		ts = -jellyfish.jaro_winkler(filename_text,comparison_text)

--- a/Python/topvariables.py
+++ b/Python/topvariables.py
@@ -4,7 +4,7 @@ import heapq
 import jellyfish
 import math

-CAPACITY = 200
+CAPACITY = 500

 VARIABLE_REGEX = ("[\s]*([\w, ]+)[\s]*=")


--- a/README.txt
+++ b/README.txt
@@ -5,13 +5,13 @@ Graphics subdirectory: Contains all graphics from the report.
 Non-code subdirectory: Contains the project proposal file and the presentation slides.

 Python subdirectory: Contains the files used for our final Python analysis
-collectvariables.py: Pulls all of the unique variables from a code text file.
-pyfilesplit.py: Splits out the desired number of program files from the raw file.
-pyfuncsplit.py: Splits out the desired number of functions files from the raw file.
-topfunctions.py: Find the function names with the lowest mean edit distance from all other function names.
-toptext.py: Computes the least unique files overall.
-topvariables.py: Computes the least unique variable names.
-topvariables_intersection.py: Checks which variable files have the largest mean number of common variables with other files.
+	-collectvariables.py: Pulls all of the unique variables from a code text file.
+	-pyfilesplit.py: Splits out the desired number of program files from the raw file.
+	-pyfuncsplit.py: Splits out the desired number of functions files from the raw file.
+	-topfunctions.py: Find the function names with the lowest mean edit distance from all other function names.
+	-toptext.py: Computes the least unique files overall.
+	-topvariables.py: Computes the least unique variable names.
+	-topvariables_intersection.py: Checks which variable files have the largest mean number of common variables with other files.

 histogram.py: Used to construct the histograms for the report.


--- a/find_top_python_variables.py
+++ b/find_top_python_variables.py
+import re

 d = {}
-with open("python.txt") as p:
+with open("plot_graph.py") as p:
 	for line in p:
-		var = re.findall("[\s]*([\w, ]+)[\s]*=")
-			d[var] = d.get(var, 0) + 1	
+		var = re.findall("[\s]*([\w, ]+)[\s]*=[^=]", line)
+		if var:
+			d[var[0].strip()] = d.get(var[0].strip(), 0) + 1	
 l = []
 for item in d:
 	l.append((d[item], item))

--- a/runsim.sh
+++ b/runsim.sh
@@ -9,12 +9,12 @@ COM="$2 --jobconf mapreduce.job.reduces=1 "
 COUNTER=0
 for i in `seq 0 $1`
 do 
-	COM="$COM--file pyfunctions/pyfile$i.txt "
+	COM="$COM--file pyvariables/varfile$i.txt "
 done

 COM="$COM index.txt"

 echo $COM

-python3 $COM > results.txt
+python3 $COM > variable_scores.txt

--- a/util.py
+++ b/util.py
@@ -25,6 +25,19 @@ def funcsim(file1name,file2name):
    return (name1, name2, params1, params2, text1, text2)

 def get_variable_score(d, text):
+    '''
+    Given a dictionary of variable scores and a text,
+    extracts the variables and determines the average
+    variable score
+    
+    Inputs:
+        d (dictionary): the dictionary containing the variable
+            scores
+        text (str): the string of the text of the file to compare
+
+    Returns:
+        score (float)
+    '''
    variables = re.findall(REG_V, text)
    num = len(variables)
    total = 0
@@ -33,6 +46,19 @@ def get_variable_score(d, text):
    return total / num

 def get_function_score(d, text):
+    '''
+    Given a dictionary of function scores and a text,
+    extracts the functions and determines the average
+    function score
+    
+    Inputs:
+        d (dictionary): the dictionary containing the function
+            scores
+        text (str): the string of the text of the file to compare
+
+    Returns:
+        score (float)
+    '''
    functions = re.findall(REG_F, text)
    num = len(functions)
    total = 0