results

79da2cf5 · Anselm Jia · 664704fd · 79da2cf5 · 79da2cf5 · 79da2cf5
Commit 79da2cf5 authored Jun 04, 2018 by Anselm Jia
Showing with 86 additions and 11 deletions
Python/topfunctext.py
Python/toptext.py
Results/funcscores.json → Results/func_map.json
Results/var_map.json
variable_scores.txt → Results/variable_scores.txt
util.py
--- a/Python/topfunctext.py
+++ b/Python/topfunctext.py
+from mrjob.job import MRJob
+import util
+import heapq
+import jellyfish
+import json
+#Set k, the number of top-k entries to collect.
+CAPACITY = 100
+class TopFuncText(MRJob):
+    def mapper_init(self):
+        #Opens .json file for variables.
+        with open("var_map.json") as var:
+            var = var.read()
+            self.varible_score = json.loads(var)
+    def mapper(self, _, line):
+        file1, file2, total = line.split(",")
+        for file_number in {file1, file2}: #If file1 == file2
+            filename = "pyfile" + file_number + ".txt"
+            for i in range(int(file1) + 1, int(total)):
+                #Get score for body text.
+                comparison_file = "pyfile" + str(i) + ".txt"
+                sims = util.funcsim(filename, comparison_file)
+                text1 = sims[4]
+                text2 = sims[5]
+                parameters1 = sims[2]
+                parameters2 = sims[3]
+                ts = -jellyfish.jaro_winkler(text1, text2)
+                for text in [(filename, text1, parameters1), (comparison_file,
+                    text2, parameters2)]:
+                    #Gets scores for variables/parameters and computes total.
+                    vs = -util.get_variable_score(self.varible_score, text[1])
+                    ps = -util.get_variable_score(self.varible_score, text[2])
+                    yield text[0], ts + vs + ps
+    def combiner(self, file, scores):
+        #Pre-aggregates within a node: emits (number of scores, sum of scores) per file.
+        cs = list(scores)
+        count = sum(cs)
+        num = len(cs)
+        yield file, (num, count)
+    def reducer_init(self):
+        #Initialize with -inf in order to make sure the placeholder
+        #is dropped.
+        self.h = [(-float("inf"), "")]
+        heapq.heapify(self.h)
+    def reducer(self, word, counts):
+        #Compute totals and means.
+        total_count = 0
+        total_num = 0
+        for count in counts:
+            total_count += count[1]
+            total_num += count[0]
+        avg = total_count / total_num
+        if len(self.h) < CAPACITY:
+            heapq.heappush(self.h, (avg, word))
+        else:
+            heapq.heappushpop(self.h, (avg, word))
+    def reducer_final(self):
+        for i in range(len(self.h)):
+            item = heapq.heappop(self.h)
+            #Negate again (scores were negated in the mapper) so we have smallest values.
+            yield item[1], -item[0]
+if __name__ == '__main__':
+  TopFuncText.run()
\ No newline at end of file
--- a/Python/toptext.py
+++ b/Python/toptext.py
@@ -5,16 +5,18 @@ import jellyfish
 import json
 #Sek k of top-k entries to collect.
-CAPACITY = 100
+CAPACITY = 10
 class TopText(MRJob):
 	def mapper_init(self):
        #Opens .json files that have scores for functions/variables.
 		with open("var_map.json") as var:
-			self.varible_score = json.reads(var)
+            var = var.read()
+			self.varible_score = json.loads(var)
 		with open("func_map.json") as func:
-			self.function_score = json.reads(func)
+            var = var.read()
+			self.function_score = json.loads(func)
    def mapper(self, _, line):
        file1, file2, total = line.split(",")
@@ -23,11 +25,8 @@ class TopText(MRJob):
        	filename = "pyfile" + file_number + ".txt"
        	file_text = filename.read()
        	for i in range(int(file1) + 1, int(total)):
-                ++--
+                #Get score for body text.
- Python/pyfuncsplit.py                             |  24 +++++-
+        		comparison_file = "pyfile" + str(i) + ".txt"
- Python/topfunctions.py                            |  22 ++++--
- Python/toptext.py                                 |  30 ++++--#Get score for body text.
-        		comparison_file = "pyfile" + i + ".txt"
        		comparison_text = comparison_file.read()
        		ts = -jellyfish.jaro_winkler(filename_text,comparison_text)
        		for text in [(filename, file_text), (comparison_file, comparison_text)]:
@@ -69,4 +68,4 @@ class TopText(MRJob):
            yield item[1], -item[0]
 if __name__ == '__main__':
-  MRWordFreqCount.run()
+  TopText.run()
\ No newline at end of file
--- a/Results/funcscores.json
+++ b/Results/funcscores.json
--- a/Results/var_map.json
+++ b/Results/var_map.json
--- a/variable_scores.txt
+++ b/variable_scores.txt
--- a/util.py
+++ b/util.py
@@ -43,7 +43,11 @@ def get_variable_score(d, text):
    total = 0
    for variable in variables:
        total += d.get(variable, DEFAULT_RETURN)
-    return total / num
+    if total:
+        score = total / num
+    else:
+        score = 0
+    return score
 def get_function_score(d, text):
    '''