Commit 8d16ddb7 by Anselm Jia

variables

parent 4c461b2d
......@@ -2,7 +2,7 @@ import re
import jellyfish
import glob
# Regex for the left-hand side of an assignment: optional leading whitespace,
# then a captured target that starts with a letter or underscore and continues
# with word characters, commas, or spaces, followed by a single "=" that is not
# immediately followed by another "=" (so "==" comparisons are rejected).
# Written as a raw string so "\s" and "\w" reach the regex engine without
# relying on Python's handling of unrecognised escape sequences.
var = r"[\s]*([a-zA-Z_][\w, ]*)[\s]*=[^=]"
def get_unique_vars(filename):
unique_vs = set()
......@@ -21,7 +21,8 @@ def main():
print("working on " + filename)
s = get_unique_vars(filename)
with open("pyvariables/varfile" + str(count) + ".txt", "w") as f:
f.write(repr(s))
for var in s:
f.write(var + "\n")
count += 1
......
......@@ -16,7 +16,7 @@ class MRWordFreqCount(MRJob):
params = util.funcsim(file1name, compare1)
text1 = params[4]
text2 = params[5]
simscore = jellyfish.jaro_distance(text1, text2)
simscore = jellyfish.jaro_distance(text1, text2)
yield file1name, simscore
yield compare1, simscore
if file1 != file2:
......
......@@ -12,30 +12,38 @@ class TopVariables(MRJob):
def mapper(self, _, line):
    """Score variable-name similarity for the files named on one input line.

    ``line`` is split on commas into ``file1``, ``file2`` and ``total``.
    For each distinct source index, the lines of ``varfile<index>.txt`` are
    compared with the lines of every ``varfile<i>.txt`` for ``i`` from
    ``index + 1`` up to (but not including) ``total``.

    Yields:
        ``(name, (1, score))`` once for each side of every compared pair,
        where ``score`` is the negated Levenshtein distance between the two
        stripped names, followed by a final ``(None, count)`` where
        ``count`` is the number of lines read from the source file(s).
    """
    file1, file2, total = line.split(",")
    upper = int(total)

    # When both fields name the same file it is processed (and counted) once.
    sources = [file1] if file1 == file2 else [file1, file2]

    line_count = 0
    for source in sources:
        with open("varfile" + source + ".txt") as handle:
            raw_names = handle.readlines()
        # The emitted count is over raw lines, before any stripping.
        line_count += len(raw_names)
        names = [raw.strip() for raw in raw_names]

        for i in range(int(source) + 1, upper):
            with open("varfile" + str(i) + ".txt") as handle:
                others = [raw.strip() for raw in handle]
            # Iterate the lists read above, never the file handles: a handle
            # is already exhausted (and closed) once its lines were read.
            for variable in names:
                for comp in others:
                    simscore = -jellyfish.levenshtein_distance(variable, comp)
                    yield variable, (1, simscore)
                    yield comp, (1, simscore)

    yield None, line_count
def combiner(self, name, scores):
......@@ -53,7 +61,7 @@ class TopVariables(MRJob):
def reducer_init(self):
    """Set up per-reducer state before any key is processed.

    ``self.h`` becomes a heap holding a single ``(-inf, "")`` entry, and
    ``self.total`` is explicitly initialised to ``None``. A bare
    ``self.total`` expression would raise AttributeError on a fresh instance
    instead of creating the attribute.
    """
    self.h = [(-float("inf"), "")]
    self.total = None
    heapq.heapify(self.h)
def reducer(self, name, scores):
......
......@@ -17,6 +17,18 @@ def funcsim(file1name,file2name):
Functions for C
'''
# Shared prefix of both patterns: one or more C type/storage keywords, then
# whitespace with up to two optional "*", then a captured identifier.
# Raw strings keep "\s", "\*", "\(" and "\[" as regex escapes instead of
# relying on Python passing unrecognised string escapes through unchanged.
_C_DECL_PREFIX = (
    r"(?:(?:auto\s*|const\s*|unsigned\s*|signed\s*|"
    r"register\s*|volatile\s*|static\s*|void\s*|short"
    r"\s*|long\s*|char\s*|int\s*|float\s*|double\s*|"
    r"_Bool\s*|complex\s*)+)(?:\s+\*?\*?\s*)"
    r"([a-zA-Z_][a-zA-Z0-9_]*)"
)
# Function pattern: the captured identifier is followed by optional spaces
# and an opening parenthesis.
FUNC_EX = _C_DECL_PREFIX + r" *\("
# Variable pattern: the captured identifier is followed by optional
# whitespace and one of "[", ";", ",", "=" or ")".
VAR_EX = _C_DECL_PREFIX + r"\s*[\[;,=)]"
def file_metrics(fname):
with open(fname) as f:
f_l = f.readlines()
......@@ -30,18 +42,6 @@ Functions for C
#print(' '.join([str(length),str(func),str(var)]))
return length, func, var
# Matches one or more C type/storage keywords, whitespace with up to two
# optional "*", and captures the identifier that is followed by optional
# spaces and "(". Raw strings are used so the backslash escapes are
# interpreted by the regex engine rather than by Python's string parser.
FUNC_EX = (r"(?:(?:auto\s*|const\s*|unsigned\s*|signed\s*|"
           r"register\s*|volatile\s*|static\s*|void\s*|short"
           r"\s*|long\s*|char\s*|int\s*|float\s*|double\s*|"
           r"_Bool\s*|complex\s*)+)(?:\s+\*?\*?\s*)([a-zA-Z"
           r"_][a-zA-Z0-9_]*) *\(")
# Same keyword/pointer prefix, but the captured identifier must be followed
# by optional whitespace and one of "[", ";", ",", "=" or ")".
VAR_EX = (r"(?:(?:auto\s*|const\s*|unsigned\s*|"
          r"signed\s*|register\s*|volatile\s*|static\s*|"
          r"void\s*|short\s*|long\s*|char\s*|int\s*|float\s*|"
          r"double\s*|_Bool\s*|complex\s*)+)(?:\s+\*?\*?\s*)("
          r"[a-zA-Z_][a-zA-Z0-9_]*)\s*[\[;,=)]")
def sim(file1, file2):
print(file1 + ' ' + file2)
len_f1, num_func1, num_var1 = file_metrics(file1)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment