Commit 3df6560b by Anselm Jia

subsetting

parent d4df4096
......@@ -8,7 +8,7 @@ def go():
current_func = ""
count = 0
for line in f:
if count < 1000000:
if count < 20:
if not func:
result = re.findall("^([\t ]*)def (\w+)\(([\w, ]*)\):",line)
if result:
......
......@@ -9,10 +9,10 @@ class TopFunctions(MRJob):
def mapper(self, _, line):
file1, file2, total = line.split(",")
file1name = "pyfile" + file1 + ".txt"
file2name = "pyfile" + file2 + ".txt"
file1name = "pyfunctions/pyfile" + file1 + ".txt"
file2name = "pyfunctions/pyfile" + file2 + ".txt"
for i in range(int(file1) + 1, int(total)):
compare1 = "pyfile" + str(i) + ".txt"
compare1 = "pyfunctions/pyfile" + str(i) + ".txt"
params = util.funcsim(file1name, compare1)
name1 = params[0]
name2 = params[1]
......@@ -21,7 +21,7 @@ class TopFunctions(MRJob):
yield name2, (1, total, simscore)
if file1 != file2:
for i in range(int(file2) + 1, int(total)):
compare1 = "pyfile" + str(i) + ".txt"
compare1 = "pyfunctions/pyfile" + str(i) + ".txt"
params = util.funcsim(file1name, compare1)
name1 = params[0]
name2 = params[1]
......
......@@ -51,43 +51,43 @@ VAR_EX = ("(?:(?:auto\s*|const\s*|unsigned\s*|"
"double\s*|_Bool\s*|complex\s*)+)(?:\s+\*?\*?\s*)("
"[a-zA-Z_][a-zA-Z0-9_]*)\s*[\[;,=)]")
def file_metrics(fname):
    """Collect three simple per-file counts.

    Walks the file line by line and returns a tuple
    ``(line_count, func_count, var_count)`` where ``func_count`` and
    ``var_count`` are the total numbers of matches of the module-level
    ``FUNC_EX`` and ``VAR_EX`` patterns, summed over every line.
    """
    line_total = 0
    func_total = 0
    var_total = 0
    with open(fname) as handle:
        for text in handle:
            line_total += 1
            func_total += len(re.findall(FUNC_EX, text))
            var_total += len(re.findall(VAR_EX, text))
    return line_total, func_total, var_total
def file_metrics(fname):
    """Collect three simple per-file counts.

    Walks the file line by line and returns a tuple
    ``(line_count, func_count, var_count)`` where ``func_count`` and
    ``var_count`` are the total numbers of matches of the module-level
    ``FUNC_EX`` and ``VAR_EX`` patterns, summed over every line.
    """
    line_total = 0
    func_total = 0
    var_total = 0
    with open(fname) as handle:
        for text in handle:
            line_total += 1
            func_total += len(re.findall(FUNC_EX, text))
            var_total += len(re.findall(VAR_EX, text))
    return line_total, func_total, var_total
def sim(file1, file2):
    """Return a combined similarity score for two files.

    The score is the sum of four terms:

    * the value of ``jellyfish.jaro_distance`` applied to the complete text
      of both files, and
    * three min/max ratios built from ``file_metrics``: the function-match
      counts, the variable-match counts, and the line counts.

    A ratio whose larger value is 0 contributes 0 to the score.

    Side effect: prints the two paths and the Jaro value to stdout.
    """
    print(file1 + ' ' + file2)
    len_f1, num_func1, num_var1 = file_metrics(file1)
    len_f2, num_func2, num_var2 = file_metrics(file2)
    with open(file1) as f1, open(file2) as f2:
        jw = jellyfish.jaro_distance(f1.read(), f2.read())
    print(jw)

    min_len = min(len_f1, len_f2)
    max_len = max(len_f1, len_f2)
    min_func = min(num_func1, num_func2)
    max_func = max(num_func1, num_func2)
    min_var = min(num_var1, num_var2)
    max_var = max(num_var1, num_var2)

    # Each ratio is guarded so that a zero maximum never divides by zero.
    if max_func != 0:
        r_func = min_func / max_func
    else:
        r_func = 0
    if max_var != 0:
        r_var = min_var / max_var
    else:
        r_var = 0
    if max_len != 0:
        r_len = min_len / max_len
    else:
        # Fix: this branch used to assign r_var, which left r_len unbound
        # (and wiped out the variable ratio) when both files had no lines.
        r_len = 0
    return jw + r_func + r_var + r_len
def sim(file1, file2):
    """Return a combined similarity score for two files.

    The score is the sum of four terms:

    * the value of ``jellyfish.jaro_distance`` applied to the complete text
      of both files, and
    * three min/max ratios built from ``file_metrics``: the function-match
      counts, the variable-match counts, and the line counts.

    A ratio whose larger value is 0 contributes 0 to the score.

    Side effect: prints the two paths and the Jaro value to stdout.
    """
    print(file1 + ' ' + file2)
    len_f1, num_func1, num_var1 = file_metrics(file1)
    len_f2, num_func2, num_var2 = file_metrics(file2)
    with open(file1) as f1, open(file2) as f2:
        jw = jellyfish.jaro_distance(f1.read(), f2.read())
    print(jw)

    min_len = min(len_f1, len_f2)
    max_len = max(len_f1, len_f2)
    min_func = min(num_func1, num_func2)
    max_func = max(num_func1, num_func2)
    min_var = min(num_var1, num_var2)
    max_var = max(num_var1, num_var2)

    # Each ratio is guarded so that a zero maximum never divides by zero.
    if max_func != 0:
        r_func = min_func / max_func
    else:
        r_func = 0
    if max_var != 0:
        r_var = min_var / max_var
    else:
        r_var = 0
    if max_len != 0:
        r_len = min_len / max_len
    else:
        # Fix: this branch used to assign r_var, which left r_len unbound
        # (and wiped out the variable ratio) when both files had no lines.
        r_len = 0
    return jw + r_func + r_var + r_len
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment