text edits

9e5e5e2b · Anselm Jia · 893e90e8 · 9e5e5e2b · 9e5e5e2b
Commit 9e5e5e2b authored Jun 04, 2018 by Anselm Jia
Showing with 97 additions and 22 deletions
Python/toptext.py
util.py
--- a/Python/toptext.py
+++ b/Python/toptext.py
 from mrjob.job import MRJob
-import util
 import heapq
 import jellyfish
 import json
+import re

 #Set k: the number of top entries to collect.
 CAPACITY = 10

 class TopText(MRJob):

-	def mapper_init(self):
+    def mapper_init(self):
        #Opens .json files that have scores for functions/variables.
-		with open("var_map.json") as var:
-            var = var.read()
-			self.varible_score = json.loads(var)
-		with open("func_map.json") as func:
-            var = var.read()
-			self.function_score = json.loads(func)
+        with open("var_map.json") as var:
+            self.varible_score = json.load(var)
+        with open("func_map.json") as func:
+            self.function_score = json.load(func)

    def mapper(self, _, line):
        file1, file2, total = line.split(",")
-        file2name = "pyfile" + file2 + ".txt"
+        file2name = "pydata/pyfile" + file2 + ".txt"
        for file_number in {file1, file2}: #If file1 == file2
-        	filename = "pyfile" + file_number + ".txt"
-        	file_text = filename.read()
-        	for i in range(int(file1) + 1, int(total)):
-                #Get score for body text.
-        		comparison_file = "pyfile" + str(i) + ".txt"
-        		comparison_text = comparison_file.read()
-        		ts = -jellyfish.jaro_winkler(filename_text,comparison_text)
-        		for text in [(filename, file_text), (comparison_file, comparison_text)]:
-                    #Gets scores for functions/variables computes total.
-        			vs = -util.get_variable_score(self.varible_score, text[1])
-        			fs = -util.get_function_score(self.function_score, text[1])
-        			yield text[0], ts + vs + fs
+            filename = "pydata/pyfile" + file_number + ".txt"
+            with open(filename) as f1:
+                file_text = f1.read()
+                for i in range(int(file1) + 1, int(total)):
+                    #Get score for body text.
+                    comparison_file = "pydata/pyfile" + str(i) + ".txt"
+                    with open(comparison_file) as comp:
+                        comparison_text = comp.read()
+                        ts = -jellyfish.jaro_winkler(file_text,comparison_text)
+                        for text in [(filename, file_text), (comparison_file,
+                            comparison_text)]:
+                            #Gets scores for functions/variables computes total.
+                            vs = -get_variable_score(self.varible_score, text[1])
+                            fs = -get_function_score(self.function_score, text[1])
+                            yield text[0], ts + vs + fs
                
    def combiner(self, file, scores):
        #Accumulates totals within node.
@@ -67,5 +68,78 @@ class TopText(MRJob):
            #Yield inverse again so we have smallest values.
            yield item[1], -item[0]

+#Regex patterns used by the scoring helpers below. Raw strings keep the
+#backslash sequences (\s, \w, \() from being read as string escapes.
+#REG_V captures the left-hand side of an assignment: a name, optionally
+#followed by more word characters, commas or spaces, then a single "="
+#that is not immediately followed by another "=".
+REG_V = r"[\s]*([a-zA-Z_][\w, ]*)[\s]*=[^=]"
+#REG_F matches a function definition header up to its opening parenthesis.
+REG_F = r"def [a-zA-Z_][\w]*\("
+#Score used for any name that is missing from a score dictionary.
+DEFAULT_RETURN = 1
+
+def funcsim(file1name,file2name):
+    '''
+    Reads two function files and splits each one into its three
+    "|"-separated fields: name, parameters and text.
+
+    Only the first two "|" characters act as separators, so the
+    text field may itself contain "|". The files are presumably
+    the ones generated by pyfuncsplit.py.
+
+    Inputs:
+        file1name, file2name (strings): files to be read.
+    Output:
+        tuple (name1, name2, params1, params2, text1, text2).
+    '''
+    with open(file1name) as first, open(file2name) as second:
+        first_fields = first.read().split("|",2)
+        second_fields = second.read().split("|",2)
+    #Unpacking raises ValueError when a file has fewer than three fields.
+    name1, params1, text1 = first_fields
+    name2, params2, text2 = second_fields
+
+    return (name1, name2, params1, params2, text1, text2)
+
+def get_variable_score(d, text):
+    '''
+    Given a dictionary of variable scores and a text,
+    extracts the variables and determines the average
+    variable score
+    
+    Inputs:
+        d (dictionary): the dictionary containing the variable
+            scores
+        text (str): the string of the text of the file to compare
+
+    Returns:
+        score (float): the mean score of the extracted variables,
+            or 0 when the text contains no assignments
+    '''
+    #NOTE(review): the captured text can carry trailing spaces or commas
+    #(e.g. "x " for "x = 1"); the keys of d presumably use the same
+    #form - confirm against the code that builds var_map.json.
+    variables = re.findall(REG_V, text)
+    num = len(variables)
+    #Guard on the match count (the divisor), as get_function_score does,
+    #rather than on the running total.
+    if not num:
+        return 0
+    #Variables missing from d fall back to DEFAULT_RETURN.
+    total = sum(d.get(variable, DEFAULT_RETURN) for variable in variables)
+    return total / num
+
+def get_function_score(d, text):
+    '''
+    Averages the scores of the function definitions found in a text.
+
+    Every match of REG_F (the "def name(" header itself) is looked
+    up in d; headers missing from d count as DEFAULT_RETURN.
+    
+    Inputs:
+        d (dictionary): the dictionary containing the function
+            scores
+        text (str): the string of the text of the file to compare
+
+    Returns:
+        score (float): the mean score of the matches, or 0 when
+            the text contains no function definitions
+    '''
+    matches = re.findall(REG_F, text)
+    count = len(matches)
+    #Nothing matched: report 0 instead of dividing by zero.
+    if not count:
+        return 0
+    scores = [d.get(match, DEFAULT_RETURN) for match in matches]
+    return sum(scores) / count
+
 if __name__ == '__main__':
  TopText.run()
\ No newline at end of file
--- a/util.py
+++ b/util.py
@@ -66,7 +66,7 @@ def get_function_score(d, text):
    functions = re.findall(REG_F, text)
    num = len(functions)
    total = 0
-    for functions in functions:
+    for function in functions:
        total += d.get(function, DEFAULT_RETURN)
    return total / num