Commit 79da2cf5 by Anselm Jia

results

parent 664704fd
from mrjob.job import MRJob
import util
import heapq
import jellyfish
import json
#Set k, the number of top-k entries to collect.
CAPACITY = 100
class TopFuncText(MRJob):
    """MapReduce job that ranks files by their average combined score.

    For every compared pair of files the mapper emits, per file, the sum of
    three negated scores: the Jaro-Winkler similarity of the two texts and
    the variable scores of that file's text and parameters.  The reducer
    averages those sums per file and keeps at most CAPACITY files in a
    min-heap, so the files that survive are the ones with the largest
    negated average (i.e. the smallest positive average).

    Every intermediate value is a ``(num, total)`` pair.  The mapper emits
    ``(1, score)`` so that the combiner's input and output have the same
    shape; the result is therefore the same whether the combiner runs zero,
    one, or several times.
    """

    def mapper_init(self):
        """Load the variable score table from ``var_map.json``."""
        with open("var_map.json") as handle:
            self.varible_score = json.load(handle)

    def mapper(self, _, line):
        """Score the files named on one input line against later files.

        Args:
            _: Ignored input key.
            line: Text of the form ``"file1,file2,total"``.

        Yields:
            ``(filename, (1, score))`` for both files of every comparison,
            where ``score`` is the negated text + variable + parameter score.
        """
        # Strip stray whitespace/newlines so they do not end up in file names.
        file1, file2, total = (field.strip() for field in line.split(","))
        first_candidate = int(file1) + 1
        last_candidate = int(total)

        # A set collapses the pair to a single pass when file1 == file2.
        for file_number in {file1, file2}:
            filename = "pyfile" + file_number + ".txt"
            # NOTE(review): the candidate range always starts after file1,
            # even while processing file2 -- confirm this is intended.
            for i in range(first_candidate, last_candidate):
                comparison_file = "pyfile" + str(i) + ".txt"
                sims = util.funcsim(filename, comparison_file)
                # Presumably indices 2/3 hold the parameters and 4/5 the body
                # text of the two files -- verify against util.funcsim.
                parameters1, parameters2 = sims[2], sims[3]
                text1, text2 = sims[4], sims[5]

                # Score for body text; shared by both files of the pair.
                ts = -jellyfish.jaro_winkler(text1, text2)

                candidates = [
                    (filename, text1, parameters1),
                    (comparison_file, text2, parameters2),
                ]
                for name, text, parameters in candidates:
                    # Scores for variables/parameters, then the total.
                    vs = -util.get_variable_score(self.varible_score, text)
                    ps = -util.get_variable_score(self.varible_score,
                                                  parameters)
                    yield name, (1, ts + vs + ps)

    def combiner(self, file, scores):
        """Merge partial ``(num, total)`` pairs for one file.

        Args:
            file: File name key.
            scores: Iterable of ``(num, total)`` pairs.

        Yields:
            ``(file, (num, total))`` with both components summed.
        """
        num = 0
        count = 0
        for partial_num, partial_count in scores:
            num += partial_num
            count += partial_count
        yield file, (num, count)

    def reducer_init(self):
        """Create the empty heap that holds the retained ``(avg, file)``."""
        # No placeholder entry: a sentinel would be emitted by reducer_final
        # whenever fewer than CAPACITY keys reach this reducer.
        self.h = []

    def reducer(self, word, counts):
        """Average one file's scores and offer the result to the heap.

        Args:
            word: File name key.
            counts: Iterable of ``(num, total)`` pairs for that file.
        """
        total_count = 0
        total_num = 0
        for num, count in counts:
            total_num += num
            total_count += count
        avg = total_count / total_num

        if len(self.h) < CAPACITY:
            heapq.heappush(self.h, (avg, word))
        else:
            # Heap is full: insert, then evict the smallest negated average.
            heapq.heappushpop(self.h, (avg, word))

    def reducer_final(self):
        """Emit the retained files with their averages made positive again.

        Yields:
            ``(file, average)`` in order of decreasing average.
        """
        while self.h:
            avg, word = heapq.heappop(self.h)
            # Undo the negation applied in the mapper.
            yield word, -avg
# Script entry point: when executed directly, hand control to the job's
# run() classmethod.
if __name__ == "__main__":
    TopFuncText.run()
\ No newline at end of file
...@@ -5,16 +5,18 @@ import jellyfish ...@@ -5,16 +5,18 @@ import jellyfish
import json import json
#Sek k of top-k entries to collect. #Sek k of top-k entries to collect.
CAPACITY = 100 CAPACITY = 10
class TopText(MRJob): class TopText(MRJob):
def mapper_init(self): def mapper_init(self):
#Opens .json files that have scores for functions/variables. #Opens .json files that have scores for functions/variables.
with open("var_map.json") as var: with open("var_map.json") as var:
self.varible_score = json.reads(var) var = var.read()
self.varible_score = json.loads(var)
with open("func_map.json") as func: with open("func_map.json") as func:
self.function_score = json.reads(func) var = var.read()
self.function_score = json.loads(func)
def mapper(self, _, line): def mapper(self, _, line):
file1, file2, total = line.split(",") file1, file2, total = line.split(",")
...@@ -23,11 +25,8 @@ class TopText(MRJob): ...@@ -23,11 +25,8 @@ class TopText(MRJob):
filename = "pyfile" + file_number + ".txt" filename = "pyfile" + file_number + ".txt"
file_text = filename.read() file_text = filename.read()
for i in range(int(file1) + 1, int(total)): for i in range(int(file1) + 1, int(total)):
++-- #Get score for body text.
Python/pyfuncsplit.py | 24 +++++- comparison_file = "pyfile" + str(i) + ".txt"
Python/topfunctions.py | 22 ++++--
Python/toptext.py | 30 ++++--#Get score for body text.
comparison_file = "pyfile" + i + ".txt"
comparison_text = comparison_file.read() comparison_text = comparison_file.read()
ts = -jellyfish.jaro_winkler(filename_text,comparison_text) ts = -jellyfish.jaro_winkler(filename_text,comparison_text)
for text in [(filename, file_text), (comparison_file, comparison_text)]: for text in [(filename, file_text), (comparison_file, comparison_text)]:
...@@ -69,4 +68,4 @@ class TopText(MRJob): ...@@ -69,4 +68,4 @@ class TopText(MRJob):
yield item[1], -item[0] yield item[1], -item[0]
if __name__ == '__main__': if __name__ == '__main__':
MRWordFreqCount.run() TopText.run()
\ No newline at end of file \ No newline at end of file
...@@ -43,7 +43,11 @@ def get_variable_score(d, text): ...@@ -43,7 +43,11 @@ def get_variable_score(d, text):
total = 0 total = 0
for variable in variables: for variable in variables:
total += d.get(variable, DEFAULT_RETURN) total += d.get(variable, DEFAULT_RETURN)
return total / num if total:
score = total / num
else:
score = 0
return score
def get_function_score(d, text): def get_function_score(d, text):
''' '''
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment