Commit 41539005 authored Mar 11, 2021 by Michelle Awh
GOT WORDS AND TAGS WORKING EYAHAHAHAHAHAHAHA
parent 97c58eda
Showing 1 changed file with 26 additions and 86 deletions
SearchResults.py
@@ -14,6 +14,7 @@ import re
 import urllib.parse
 import sys
 import unicodedata
+from collections import Counter
@@ -229,11 +230,8 @@ class Article:
         self.rating = self.find_rating(self.__soup)
         self.languages = self.find_languages(self.__soup)
         self.view = self.__repr__()
-        self.words = self.scrape_words(self.url, self.__soup)
-        '''
-        self.salient = self.find_salient_ngrams(self.words, \
-                                                1, False, 0.5)
-        '''
+        self.words = self.scrape_words(self.__soup)
+        self.tags = self.find_common_words(self.words)

     def get_page_soup(self, url):
         response = requests.get(url)
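After this change the constructor derives both words and tags from the parsed soup alone, with no second fetch of the URL. A rough usage sketch follows; the Article constructor's exact signature is not shown in this diff, so the call below is illustrative only:

    # Hypothetical usage; the constructor argument is assumed, not shown in this diff.
    article = Article("https://www.wikihow.com/Pet-a-Cat")
    print(article.words[:80])   # one long string: title + description + step text
    print(article.tags)         # up to 15 most common words, via find_common_words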
@@ -324,7 +322,7 @@ class Article:
         return translations

-    def scrape_words(self, url, soup):
+    def scrape_words(self, soup):
         """
         Take all the steps and descriptions from a given
         wikihow articles and lump into one large string
@@ -332,18 +330,19 @@ class Article:
         title = soup.find("h1", id="section_0").text
         description = soup.find("div", class_="mf-section-0").text
         steps = soup.find_all("div", class_="step")
         text = []
+        main_text = []
+        for step in steps:
+            s = str(step)
+            match = re.search('/b>(.*)<sup', s)
+            if match != None:
+                main_text.append(match.group(1))
         top = title + description
         top = re.findall("[^\n][a-zA-Z']+", top)
-        for y in top:
-            y = y.strip()
-            text.append(y)
-        for s in steps:
-            t = re.findall("[^\n][a-zA-Z']+", s.text)
-            for x in t:
-                x = x.strip()
-                text.append(x)
-        return " ".join(text)
+        print("AAA")
+        text = [y.strip() for y in top]
+        for t in main_text:
+            text += t.split()
+        return ' '.join(text)

     def all_peripheral_links(self, soup, url):
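The rewritten scrape_words pulls each step's main sentence out of the raw HTML with re.search('/b>(.*)<sup', s): it captures whatever sits between the closing </b> of the step's bold lead-in and the first <sup> reference marker. A minimal standalone sketch, using made-up markup (real wikiHow steps vary, and the greedy .* would run to the last <sup> if a step carried several references):

    import re

    # Simplified, hypothetical step markup for illustration only.
    s = ('<div class="step"><b>Approach slowly.</b>'
         'Let the cat sniff your hand first.<sup id="ref1"></sup></div>')

    match = re.search('/b>(.*)<sup', s)
    if match is not None:      # same test as the diff's `match != None`, written idiomatically
        print(match.group(1))  # -> Let the cat sniff your hand first.

The print("AAA") kept in the committed code appears to be leftover debugging output.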
@@ -415,35 +414,23 @@ class Article:
             links.append(filtered_link)
         return links

-    '''
-    def find_salient_ngrams(self, wh_page, n, case_sensitive, threshold):
-        all_n_grams = []
-        ngram = n_gram_salient(wh_page,case_sensitive,n)
-        all_n_grams.append(ngram)
-        salient_ngrams = find_salient(all_n_grams,threshold)
-        return salient_ngrams
-
-    def pre_processing(self, wh_page,caps, salient):
+    def pre_processing(self, wh_page, caps):
         list_of_words = wh_page.split()
         list_of_words = [word.strip(PUNCTUATION) for word in list_of_words \
                          if word.strip(PUNCTUATION) != '']
         if not caps:
             list_of_words = [word.lower() for word in list_of_words]
-        if not salient: #if we want to do things like find top k or find min etc.
-            list_of_words = [word for word in list_of_words if word not in STOP_WORDS]
+        list_of_words = [word for word in list_of_words if word not in STOP_WORDS]
         list_of_words = [word for word in list_of_words \
                          if not word.startswith(STOP_PREFIXES)]
         return list_of_words

     def n_gram_salient(self, wh_page, caps, n):
         n_gram_lst = []
-        list_of_words = pre_processing(wh_page,caps)
+        list_of_words = self.pre_processing(wh_page, caps)
         start_value = 0
         stop_value = n
         while stop_value <= len(list_of_words):
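n_gram_salient walks a fixed-width window over the token list with the start_value/stop_value pair; the loop body itself is collapsed in this diff. A minimal sketch of the technique, assuming the hidden body appends each window as a tuple (a shape consistent with how find_common_words below indexes results with w[0][0]):

    def n_grams(tokens, n):
        """Sliding-window n-grams, mirroring the start_value/stop_value loop."""
        n_gram_lst = []
        start_value, stop_value = 0, n
        while stop_value <= len(tokens):
            n_gram_lst.append(tuple(tokens[start_value:stop_value]))  # assumed tuple form
            start_value += 1
            stop_value += 1
        return n_gram_lst

    # n_grams(["pet", "the", "cat"], 2) -> [("pet", "the"), ("the", "cat")]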
@@ -452,63 +439,16 @@ class Article:
             stop_value += 1
         return n_gram_lst

+    def find_common_words(self, string):
+        words_lst = Counter()
+        string_lst = self.n_gram_salient(string, False, 1)
+        words_lst.update(w for w in string_lst)
+        common_words = [w for w in words_lst.most_common(15)]
+        common_words = [w[0][0] for w in common_words]
+        return common_words
-    def calc_tf(self, a_token, tokens):
-        """
-        calculates term frequency
-        """
-        token_list = count_tokens(tokens)
-        max_term = max(token_list.values())
-        f_term = token_list[a_token]
-        tf = 0.5+0.5*(f_term/max_term)
-        return tf
-
-    def count_tokens(self, tokens):
-        token_dict = {}
-        for token in tokens:
-            if token not in token_dict:
-                token_dict[token] = 1
-            else:
-                token_dict[token] += 1
-        return token_dict
-
-    def calc_idf(self, docs,a_token):
-        """
-        calculates inverse document frequency
-        """
-        docs_with_t = 0
-        N = len(docs)
-        for lst in docs:
-            if a_token in lst:
-                docs_with_t +=1
-        idf = math.log(N/docs_with_t)
-        return idf
-
-    def find_salient(self, docs, threshold):
-        """
-        Compute the salient words for each document. A word is salient if
-        its tf-idf score is strictly above a given threshold.
-        Inputs:
-            docs: list of list of tokens
-            threshold: float
-        Returns: list of sets of salient words
-        """
-        salient_lst = []
-        for tok_lst in docs:
-            new_set = set()
-            for a_token in tok_lst:
-                tf_idf = calc_tf(a_token,tok_lst) * calc_idf(docs, a_token)
-                if tf_idf > threshold:
-                    new_set.add(a_token)
-            salient_lst.append(new_set)
-        return salient_lst
-    '''

     def __repr__(self):
         info = self.title
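find_common_words is where the new Counter import pays off: most_common(15) returns (item, count) pairs, and because n_gram_salient is called with n=1, each item is a 1-tuple, which is why the code unpacks w[0][0]. A small demonstration with assumed data:

    from collections import Counter

    # Assumed shape: n_gram_salient(string, False, 1) yields 1-tuples like ("cat",).
    words_lst = Counter([("cat",), ("cat",), ("pet",)])
    pairs = words_lst.most_common(15)    # -> [(("cat",), 2), (("pet",), 1)]
    tags = [w[0][0] for w in pairs]      # w[0] is the 1-tuple, w[0][0] the bare word
    print(tags)                          # -> ['cat', 'pet']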