Commit f411ae04 by Ethan Mertz

fixed top text and util to work

parent 12efb152
Showing with 41 additions and 18 deletions
...@@ -2,32 +2,33 @@ from mrjob.job import MRJob ...@@ -2,32 +2,33 @@ from mrjob.job import MRJob
import util import util
import heapq import heapq
import jellyfish import jellyfish
import json
CAPACITY = 100 CAPACITY = 100
class MRWordFreqCount(MRJob): class MRWordFreqCount(MRJob):
def mapper_init(self):
    """Load the variable and function score maps from JSON files.

    Reads ``var_map.json`` and ``func_map.json`` from the current working
    directory and stores the parsed objects on the instance for the mapper
    to pass to the ``util`` scoring helpers.

    Raises:
        IOError/OSError: if either file cannot be opened.
        ValueError: if either file does not contain valid JSON.
    """
    # json.load parses an open file object; the json module has no `reads`.
    with open("var_map.json") as var:
        # NOTE: attribute spelling ("varible") is kept as-is because the
        # mapper reads self.varible_score.
        self.varible_score = json.load(var)
    with open("func_map.json") as func:
        self.function_score = json.load(func)
def mapper(self, _, line):
    """Score each of the two named files against every later-numbered file.

    Args:
        _: unused input key.
        line: a comma-separated record ``"file1,file2,total"`` where
            ``file1`` and ``file2`` are file numbers and ``total`` is the
            exclusive upper bound of the file numbers to compare against.

    Yields:
        ``(filename, score)`` pairs, one for each side of every comparison.
        ``score`` is the Jaro-Winkler value of the two texts plus that
        file's own variable score and function score.
    """
    file1, file2, total = line.split(",")
    upper = int(total)
    # A set collapses the pair to one entry when file1 == file2, so the
    # same file is not processed twice.
    for file_number in {file1, file2}:
        filename = "pyfile" + file_number + ".txt"
        with open(filename) as handle:
            file_text = handle.read()
        # Start from the file currently being processed (not always file1)
        # so each file is compared only with higher-numbered files.
        for i in range(int(file_number) + 1, upper):
            comparison_file = "pyfile" + str(i) + ".txt"
            with open(comparison_file) as handle:
                comparison_text = handle.read()
            ts = jellyfish.jaro_winkler(file_text, comparison_text)
            for name, text in ((filename, file_text),
                               (comparison_file, comparison_text)):
                vs = util.get_variable_score(self.varible_score, text)
                fs = util.get_function_score(self.function_score, text)
                yield name, ts + vs + fs
def combiner(self, word, counts): def combiner(self, word, counts):
cs = list(counts) cs = list(counts)
......
import re import re
import jellyfish import jellyfish
# Raw strings keep the regex escapes (\s, \w, \() literal without relying on
# Python passing unknown escape sequences through unchanged.
# REG_V captures an identifier-like run (letters, digits, underscores, commas
# and spaces, so the capture may include trailing spaces or several
# comma-separated names) that is followed by a single "=" and then any
# character other than "=".
REG_V = r"[\s]*([a-zA-Z_][\w, ]*)[\s]*=[^=]"
# REG_F matches "def <identifier>(" — the start of a function definition.
REG_F = r"def [a-zA-Z_][\w]*\("
# Score used for a name that is missing from a score map.
DEFAULT_RETURN = 1
def funcsim(file1name,file2name): def funcsim(file1name,file2name):
count = 0 count = 0
with open(file1name) as f1: with open(file1name) as f1:
...@@ -12,6 +16,24 @@ def funcsim(file1name,file2name): ...@@ -12,6 +16,24 @@ def funcsim(file1name,file2name):
return (name1, name2, params1, params2, text1, text2) return (name1, name2, params1, params2, text1, text2)
def get_variable_score(d, text):
    """Return the mean score of the assignment targets found in *text*.

    Args:
        d: mapping from a matched target string to its score.
        text: source text to scan with ``REG_V``.

    Returns:
        The average of ``d.get(target, DEFAULT_RETURN)`` over every match,
        or ``DEFAULT_RETURN`` when *text* contains no matches.
    """
    variables = re.findall(REG_V, text)
    if not variables:
        # Nothing matched: averaging would divide by zero.
        return DEFAULT_RETURN
    # NOTE(review): REG_V's capture group can include trailing spaces and
    # comma-separated names; confirm the keys of `d` are in the same form.
    total = sum(d.get(variable, DEFAULT_RETURN) for variable in variables)
    return total / len(variables)
def get_function_score(d, text):
    """Return the mean score of the function definitions found in *text*.

    Args:
        d: mapping from a matched ``"def name("`` string to its score.
        text: source text to scan with ``REG_F``.

    Returns:
        The average of ``d.get(match, DEFAULT_RETURN)`` over every match,
        or ``DEFAULT_RETURN`` when *text* contains no matches.
    """
    functions = re.findall(REG_F, text)
    if not functions:
        # Nothing matched: averaging would divide by zero.
        return DEFAULT_RETURN
    # The loop variable is singular so the lookup uses the current match and
    # the list being iterated is not rebound.
    total = sum(d.get(function, DEFAULT_RETURN) for function in functions)
    return total / len(functions)
''' '''
Functions for C Functions for C
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment