Commit e98258a9 by Anselm Jia

Merge branch 'master' of mit.cs.uchicago.edu:ethanmertz/CS-123-Final

parents c7069847 9e5e5e2b
Showing with 97 additions and 22 deletions
from mrjob.job import MRJob
import util
import heapq
import jellyfish
import json
import re
#Set k, the number of top-k entries to collect.
CAPACITY = 10
class TopText(MRJob):
def mapper_init(self):
    '''
    mrjob mapper_init hook: loads the score tables that mapper()
    reads from self.

    Reads "var_map.json" and "func_map.json" from the current
    working directory and stores the decoded JSON on
    self.varible_score and self.function_score.
    '''
    #Opens .json files that have scores for functions/variables.
    #json.load parses straight from the open file handle.
    with open("var_map.json") as var:
        #Attribute spelling ("varible") is kept because mapper() reads it.
        self.varible_score = json.load(var)
    with open("func_map.json") as func:
        self.function_score = json.load(func)
def mapper(self, _, line):
    '''
    Scores source files against every later-numbered file.

    Inputs:
        _ : unused input key.
        line (str): three comma-separated fields "file1,file2,total".
            file1 and file2 are file numbers; total is the exclusive
            upper bound of the file numbers to compare against.

    Yields:
        (path, score) pairs, where path is a "pydata/pyfile<N>.txt"
        path and score is the sum of the negated text-similarity,
        variable and function scores.
    '''
    file1, file2, total = line.split(",")
    #A set collapses the two numbers into one when file1 == file2.
    for file_number in {file1, file2}:
        filename = "pydata/pyfile" + file_number + ".txt"
        with open(filename) as f1:
            file_text = f1.read()
        #The comparison range always starts after file1, for both files.
        for i in range(int(file1) + 1, int(total)):
            #Get score for body text.
            comparison_file = "pydata/pyfile" + str(i) + ".txt"
            with open(comparison_file) as comp:
                comparison_text = comp.read()
            #NOTE(review): newer jellyfish releases expose this as
            #jaro_winkler_similarity - confirm the installed version.
            ts = -jellyfish.jaro_winkler(file_text, comparison_text)
            candidates = [(filename, file_text),
                          (comparison_file, comparison_text)]
            for path, text in candidates:
                #Gets scores for functions/variables computes total.
                vs = -get_variable_score(self.varible_score, text)
                fs = -get_function_score(self.function_score, text)
                yield path, ts + vs + fs
def combiner(self, file, scores):
#Accumulates totals within node.
......@@ -67,5 +68,78 @@ class TopText(MRJob):
#Yield inverse again so we have smallest values.
yield item[1], -item[0]
#Matches the left-hand side of an assignment ("=" not followed by "=").
#The captured group may hold several comma-separated names and keeps any
#spaces before the "=" (e.g. "x = 1" captures "x ").
REG_V = r"[\s]*([a-zA-Z_][\w, ]*)[\s]*=[^=]"
#Matches a function definition header up to and including the "(".
#There is no capture group, so re.findall returns the whole "def name(".
REG_F = r"def [a-zA-Z_][\w]*\("
#Score used for any variable/function that is missing from a score map.
DEFAULT_RETURN = 1
def funcsim(file1name,file2name):
    '''
    Given two function files generated by pyfuncsplit.py, we get
    the name, parameters, and text of each function.

    Inputs:
        file1name, file2name (strings): files to be read. Each file
            holds "name|params|text"; only the first two "|" split,
            so the text itself may contain "|".

    Output:
        tuple (name1, name2, params1, params2, text1, text2).

    Raises:
        ValueError: if either file has fewer than two "|" separators.
    '''
    name1, params1, text1 = _read_function_file(file1name)
    name2, params2, text2 = _read_function_file(file2name)
    return (name1, name2, params1, params2, text1, text2)


def _read_function_file(filename):
    '''
    Reads one "name|params|text" file and returns its three fields.
    '''
    with open(filename) as f:
        fields = f.read().split("|", 2)
    if len(fields) != 3:
        #Name the offending file instead of a bare unpacking error.
        raise ValueError(
            "expected 'name|params|text' in %s, found %d field(s)"
            % (filename, len(fields)))
    return fields
def get_variable_score(d, text):
    '''
    Given a dictionary of variable scores and a text,
    extracts the variables and determines the average
    variable score.

    Inputs:
        d (dictionary): the dictionary containing the variable scores
        text (str): the string of the text of the file to compare

    Returns:
        score (float): the mean score over every REG_V match in text.
            Matches missing from d count as DEFAULT_RETURN. Returns 0
            when text has no matches.
    '''
    #NOTE(review): REG_V can capture trailing spaces and comma-separated
    #names as one string - confirm the keys of d are built the same way.
    variables = re.findall(REG_V, text)
    num = len(variables)
    #Guard on the divisor itself, as get_function_score does.
    if not num:
        return 0
    total = sum(d.get(variable, DEFAULT_RETURN) for variable in variables)
    return total / num
def get_function_score(d, text):
    '''
    Averages the scores of the function definitions found in a text.

    Inputs:
        d (dictionary): maps each REG_F match (the whole "def name("
            string) to its score
        text (str): the text of the file to score

    Returns:
        score (float): the mean score over every REG_F match in text.
            Matches missing from d count as DEFAULT_RETURN. Returns 0
            when text has no matches.
    '''
    headers = re.findall(REG_F, text)
    #Nothing to average, and nothing to divide by.
    if not headers:
        return 0
    summed = sum(d.get(header, DEFAULT_RETURN) for header in headers)
    return summed / len(headers)
#Run the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
    TopText.run()
\ No newline at end of file
......@@ -66,7 +66,7 @@ def get_function_score(d, text):
functions = re.findall(REG_F, text)
num = len(functions)
total = 0
for functions in functions:
for function in functions:
total += d.get(function, DEFAULT_RETURN)
return total / num
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment