Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
Ethan Mertz
/
CS-123-Final
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Pipelines
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
9e5e5e2b
authored
Jun 04, 2018
by
Anselm Jia
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
text edits
parent
893e90e8
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
97 additions
and
22 deletions
Python/toptext.py
util.py
Python/toptext.py
View file @
9e5e5e2b
from
mrjob.job
import
MRJob
import
util
import
heapq
import
jellyfish
import
json
import
re
#Set k, the number of top-k entries to collect.
CAPACITY
=
10
class
TopText
(
MRJob
):
def mapper_init(self):
    '''
    Loads the score tables used by the mapper.

    Reads var_map.json and func_map.json (paths relative to the
    working directory) and stores the decoded JSON on the instance
    as self.varible_score and self.function_score.
    '''
    #Opens .json files that have scores for functions/variables.
    #json.load decodes straight from the open handle, so each table
    #is read from its own file.
    with open("var_map.json") as var:
        #Attribute spelling is kept as-is; mapper reads this exact name.
        self.varible_score = json.load(var)
    with open("func_map.json") as func:
        self.function_score = json.load(func)
def mapper(self, _, line):
    '''
    Scores the files named on an input line against later files.

    Inputs:
        _: input key (unused)
        line (str): comma-separated record "file1,file2,total"
    Yields:
        (path, score) pairs, one for the base file and one for the
        comparison file of every comparison made. score is the sum
        of the negated jaro_winkler text score and the negated
        variable and function scores of that file's text.
    '''
    file1, file2, total = line.split(",")
    #A set collapses the pair to a single entry if file1 == file2.
    for file_number in {file1, file2}:
        filename = "pydata/pyfile" + file_number + ".txt"
        with open(filename) as f1:
            file_text = f1.read()
        #Comparison files run from file1 + 1 up to, but excluding, total.
        for i in range(int(file1) + 1, int(total)):
            #Get score for body text.
            comparison_file = "pydata/pyfile" + str(i) + ".txt"
            with open(comparison_file) as comp:
                comparison_text = comp.read()
            ts = -jellyfish.jaro_winkler(file_text, comparison_text)
            for path, body in [(filename, file_text),
                               (comparison_file, comparison_text)]:
                #Gets scores for functions/variables computes total.
                vs = -get_variable_score(self.varible_score, body)
                fs = -get_function_score(self.function_score, body)
                yield path, ts + vs + fs
def
combiner
(
self
,
file
,
scores
):
#Accumulates totals within node.
...
...
@@ -67,5 +68,78 @@ class TopText(MRJob):
#Yield inverse again so we have smallest values.
yield
item
[
1
],
-
item
[
0
]
#Raw strings keep the regex backslashes literal; the pattern values
#are the same as the non-raw spellings.
#Matches an assignment target: an identifier (optionally followed by
#more word characters, commas or spaces) before a single "=".
REG_V = r"[\s]*([a-zA-Z_][\w, ]*)[\s]*=[^=]"
#Matches a function header up to and including its opening parenthesis.
REG_F = r"def [a-zA-Z_][\w]*\("
#Score used for a name that is missing from a score dictionary.
DEFAULT_RETURN = 1
def funcsim(file1name, file2name):
    '''
    Reads two function files generated by pyfuncsplit.py and splits
    each one into its name, parameters and body text.

    Inputs:
        file1name, file2name (strings): files to be read. Each file
            holds "name|parameters|text"; only the first two "|"
            act as separators.
    Output:
        tuple (name1, name2, params1, params2, text1, text2).
    '''
    #Both files are opened before either is read.
    with open(file1name) as first, open(file2name) as second:
        first_parts = first.read().split("|", 2)
        second_parts = second.read().split("|", 2)
    name1, params1, text1 = first_parts
    name2, params2, text2 = second_parts
    return (name1, name2, params1, params2, text1, text2)
def get_variable_score(d, text):
    '''
    Averages the scores of the variable names found in a text.

    Inputs:
        d (dictionary): maps a variable name to its score; a name
            missing from d counts as DEFAULT_RETURN
        text (str): the text of the file to score
    Returns:
        score (float): the summed score divided by the number of
            REG_V matches, or 0 when that sum is zero
    '''
    found = re.findall(REG_V, text)
    running = sum(d.get(name, DEFAULT_RETURN) for name in found)
    #A zero sum, which includes the no-match case, gives 0 directly.
    return running / len(found) if running else 0
def get_function_score(d, text):
    '''
    Averages the scores of the function headers found in a text.

    Inputs:
        d (dictionary): maps a REG_F match to its score; a match
            missing from d counts as DEFAULT_RETURN
        text (str): the text of the file to score
    Returns:
        score (float): the summed score divided by the number of
            REG_F matches, or 0 when there are no matches
    '''
    headers = re.findall(REG_F, text)
    #No matches: return 0 instead of dividing by zero.
    if not headers:
        return 0
    return sum(d.get(header, DEFAULT_RETURN) for header in headers) / len(headers)
#Runs the TopText job only when this file is executed as a script,
#not when it is imported.
if __name__ == '__main__':
    TopText.run()
\ No newline at end of file
util.py
View file @
9e5e5e2b
...
...
@@ -66,7 +66,7 @@ def get_function_score(d, text):
functions
=
re
.
findall
(
REG_F
,
text
)
num
=
len
(
functions
)
total
=
0
for
function
s
in
functions
:
for
function
in
functions
:
total
+=
d
.
get
(
function
,
DEFAULT_RETURN
)
return
total
/
num
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment