Commit 454363b4 by Ethan Mertz

pushing latest version

parent be5fa434
......@@ -43,8 +43,8 @@ def go(subset):
current_file.append(line)
#Writes out the last file to the directory.
with open('pydata/pyfile' + str(count) +".txt", 'w') as new:
for l in current_file:
new.write(l)
for l in current_file:
new.write(l)
if __name__ == "__main__":
subset = int(sys.argv[1])
......
......@@ -23,7 +23,10 @@ class TopText(MRJob):
filename = "pyfile" + file_number + ".txt"
file_text = filename.read()
for i in range(int(file1) + 1, int(total)):
#Get score for body text.
++--
Python/pyfuncsplit.py | 24 +++++-
Python/topfunctions.py | 22 ++++--
Python/toptext.py | 30 ++++--
#Get score for body text.
comparison_file = "pyfile" + i + ".txt"
comparison_text = comparison_file.read()
ts = -jellyfish.jaro_winkler(filename_text,comparison_text)
......
......@@ -4,7 +4,7 @@ import heapq
import jellyfish
import math
CAPACITY = 200
CAPACITY = 500
VARIABLE_REGEX = ("[\s]*([\w, ]+)[\s]*=")
......
......@@ -5,13 +5,13 @@ Graphics subdirectory: Contains all graphics from the report.
Non-code subdirectory: Contains the project proposal file and the presentation slides.
Python subdirectory: Contains the files used for our final Python analysis
-collectvariables.py: Pulls all of the unique variables from a code text file.
-pyfilesplit.py: Splits out the desired number of program files from the raw file.
-pyfuncsplit.py: Splits out the desired number of function files from the raw file.
-topfunctions.py: Finds the function names with the lowest mean edit distance from all other function names.
-toptext.py: Computes the least unique files overall.
-topvariables.py: Computes the least unique variable names.
-topvariables_intersection.py: Checks which variable files have the largest mean number of common variables with other files.
-collectvariables.py: Pulls all of the unique variables from a code text file.
-pyfilesplit.py: Splits out the desired number of program files from the raw file.
-pyfuncsplit.py: Splits out the desired number of function files from the raw file.
-topfunctions.py: Finds the function names with the lowest mean edit distance from all other function names.
-toptext.py: Computes the least unique files overall.
-topvariables.py: Computes the least unique variable names.
-topvariables_intersection.py: Checks which variable files have the largest mean number of common variables with other files.
histogram.py: Used to construct the histograms for the report.
......
import re
d = {}
with open("python.txt") as p:
with open("plot_graph.py") as p:
for line in p:
var = re.findall("[\s]*([\w, ]+)[\s]*=")
d[var] = d.get(var, 0) + 1
var = re.findall("[\s]*([\w, ]+)[\s]*=[^=]", line)
if var:
d[var[0].strip()] = d.get(var[0].strip(), 0) + 1
l = []
for item in d:
l.append((d[item], item))
......
......@@ -9,12 +9,12 @@ COM="$2 --jobconf mapreduce.job.reduces=1 "
COUNTER=0
for i in `seq 0 $1`
do
COM="$COM--file pyfunctions/pyfile$i.txt "
COM="$COM--file pyvariables/varfile$i.txt "
done
COM="$COM index.txt"
echo $COM
python3 $COM > results.txt
python3 $COM > variable_scores.txt
......@@ -25,6 +25,19 @@ def funcsim(file1name,file2name):
return (name1, name2, params1, params2, text1, text2)
def get_variable_score(d, text):
'''
Given a dictionary of variable scores and a text,
extracts the variables and determines the average
variable score
Inputs:
d (dictionary): the dictionary containing the variable scores
scores
text (str): the string of the text of the file to compare
Returns:
score (float)
'''
variables = re.findall(REG_V, text)
num = len(variables)
total = 0
......@@ -33,6 +46,19 @@ def get_variable_score(d, text):
return total / num
def get_function_score(d, text):
'''
Given a dictionary of function scores and a text,
extracts the functions and determines the average
function score
Inputs:
d (dictionary): the dictionary containing the function scores
scores
text (str): the string of the text of the file to compare
Returns:
score (float)
'''
functions = re.findall(REG_F, text)
num = len(functions)
total = 0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment