Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
Michelle Awh
/
project_kitty
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Pipelines
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
68a3a764
authored
Mar 12, 2021
by
Michelle Awh
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
sultry gift for u chris
parent
1125f4b9
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
0 additions
and
94 deletions
SearchResults.py
SearchResults.py
View file @
68a3a764
...
...
@@ -14,7 +14,6 @@ import re
import
urllib.parse
import
sys
import
unicodedata
<<<<<<<
HEAD
from
collections
import
Counter
import
nltk
nltk
.
download
(
'stopwords'
)
...
...
@@ -24,28 +23,6 @@ inappropriate_words_lst = inappropriate_words()
=======
def
keep_chr
(
ch
):
'''
Find all characters that are classifed as punctuation in Unicode
(except #, @, &) and combine them into a single string.
'''
return
unicodedata
.
category
(
ch
)
.
startswith
(
'P'
)
and
\
(
ch
not
in
(
"#"
,
"@"
,
"&"
))
PUNCTUATION
=
" "
.
join
([
chr
(
i
)
for
i
in
range
(
sys
.
maxunicode
)
if
keep_chr
(
chr
(
i
))])
STOP_WORDS
=
[
"a"
,
"an"
,
"the"
,
"this"
,
"that"
,
"of"
,
"for"
,
"or"
,
"and"
,
"on"
,
"to"
,
"be"
,
"if"
,
"we"
,
"you"
,
"in"
,
"is"
,
"at"
,
"it"
,
"rt"
,
"mt"
,
"with"
,
"t"
,
"don"
,
"wh"
]
STOP_PREFIXES
=
(
"@"
,
"#"
,
"http"
,
"&"
)
>>>>>>>
d2692ff014493cfae8c975b6a9b16d55f549c9fb
...
...
@@ -87,7 +64,6 @@ class SearchResults:
Returns:
a SearchResults object
'''
<<<<<<<
HEAD
self
.
__filters
=
filters
self
.
__required_words
=
[]
self
.
__required_words
+=
self
.
get_required_words
(
query
)
...
...
@@ -98,19 +74,6 @@ class SearchResults:
else
:
self
.
__query
=
query
self
.
__soup
=
self
.
get_search_soup
(
self
.
__query
)
=======
self
.
__query
=
query
self
.
__filters
=
filters
self
.
__required_words
=
self
.
get_required_words
(
query
)
if
self
.
__required_words
!=
None
:
self
.
__filters
[
'required_words'
]
=
self
.
__required_words
self
.
__forbidden_words
=
self
.
get_forbidden_words
(
query
)
if
self
.
__forbidden_words
!=
None
:
self
.
__filters
[
'forbidden_words'
]
=
self
.
__forbidden_words
self
.
__soup
=
self
.
get_search_soup
(
query
)
>>>>>>>
d2692ff014493cfae8c975b6a9b16d55f549c9fb
self
.
__links
=
[
a
[
'href'
]
for
a
in
self
.
__soup
.
find_all
(
\
'a'
,
class_
=
'result_link'
)]
self
.
__results
=
self
.
__soup
.
find_all
(
'a'
,
class_
=
'result_link'
)
...
...
@@ -121,17 +84,12 @@ class SearchResults:
self
.
__outer_articles
=
[
r
for
r
in
self
.
__unfiltered_outer_articles
\
if
self
.
passes_outer_filters
(
r
)]
self
.
__unfiltered_articles
=
[
Article
(
r
.
url
)
for
r
in
self
.
__outer_articles
]
<<<<<<<
HEAD
self
.
articles
=
[]
for
a
in
self
.
__unfiltered_articles
:
if
self
.
passes_all_filters
(
a
):
self
.
articles
.
append
(
a
)
for
a
in
self
.
articles
:
a
.
related_enough
()
=======
self
.
articles
=
[
a
for
a
in
self
.
__unfiltered_articles
if
\
self
.
passes_all_filters
(
a
)]
>>>>>>>
d2692ff014493cfae8c975b6a9b16d55f549c9fb
def
get_search_soup
(
self
,
query
):
...
...
@@ -142,11 +100,6 @@ class SearchResults:
def
passes_outer_filters
(
self
,
outer_article
):
<<<<<<<
HEAD
print
(
self
.
__filters
)
print
(
'checking'
)
=======
>>>>>>>
d2692ff014493cfae8c975b6a9b16d55f549c9fb
if
'date_updated'
in
self
.
__filters
:
num
,
interval
=
outer_article
.
date_updated
.
split
()[:
2
]
num
=
int
(
num
)
...
...
@@ -207,16 +160,11 @@ class SearchResults:
if
self
.
__filters
[
'child_safe'
]:
self
.
__forbidden_words
+=
inappropriate_words_lst
if
'pct_helpful'
in
self
.
__filters
:
<<<<<<<
HEAD
try
:
if
int
(
article
.
pct_helpful
)
<
self
.
__filters
[
'pct_helpful'
]:
return
False
except
TypeError
:
pass
=======
if
article
.
pct_helpful
<
self
.
__filters
[
'pct_helpful'
]:
return
False
>>>>>>>
d2692ff014493cfae8c975b6a9b16d55f549c9fb
if
'num_voters'
in
self
.
__filters
:
if
int
(
article
.
num_voters
)
<
self
.
__filters
[
'num_voters'
]:
return
False
...
...
@@ -489,7 +437,6 @@ class Article:
links
.
append
(
filtered_link
)
return
links
<<<<<<<
HEAD
...
...
@@ -536,12 +483,9 @@ class Article:
self
.
related_articles
+=
actual_related_articles
self
.
related_categories
+=
actual_related_categories
=======
>>>>>>>
d2692ff014493cfae8c975b6a9b16d55f549c9fb
def
pre_processing
(
self
,
wh_page
,
caps
):
<<<<<<<
HEAD
tokenizer
=
nltk
.
RegexpTokenizer
(
r'\w+'
)
list_of_words
=
tokenizer
.
tokenize
(
wh_page
)
list_of_words
=
[
word
for
word
in
list_of_words
if
word
not
in
stopwords
.
words
(
'english'
)]
...
...
@@ -549,33 +493,6 @@ class Article:
def
n_gram
(
self
,
wh_page
,
caps
,
n
):
=======
list_of_words
=
wh_page
.
split
()
list_of_words
=
[
word
.
strip
(
PUNCTUATION
)
for
word
in
list_of_words
\
if
word
.
strip
(
PUNCTUATION
)
!=
''
]
if
not
caps
:
list_of_words
=
[
word
.
lower
()
for
word
in
list_of_words
]
list_of_words
=
[
word
for
word
in
list_of_words
if
word
not
in
STOP_WORDS
]
list_of_words
=
[
word
for
word
in
list_of_words
\
if
not
word
.
startswith
(
STOP_PREFIXES
)]
return
list_of_words
def
find_common_words
(
self
,
string_lst
):
words_lst
=
Counter
()
strng_lst
=
self
.
n_gram
(
string_lst
,
False
,
1
)
print
(
strng_lst
)
words_lst
.
update
(
w
for
w
in
strng_lst
)
print
(
words_lst
)
common_words
=
[
w
for
w
in
words_lst
.
most_common
(
15
)]
common_words
=
[
w
[
0
][
0
]
for
w
in
common_words
]
return
common_words
def
n_gram
(
self
,
wh_page
,
caps
,
n
):
>>>>>>>
d2692ff014493cfae8c975b6a9b16d55f549c9fb
n_gram_lst
=
[]
list_of_words
=
self
.
pre_processing
(
wh_page
,
caps
)
start_value
=
0
...
...
@@ -585,7 +502,6 @@ class Article:
start_value
+=
1
stop_value
+=
1
return
n_gram_lst
<<<<<<<
HEAD
def
find_common_words
(
self
,
string
):
...
...
@@ -595,8 +511,6 @@ class Article:
common_words
=
[
w
for
w
in
words_lst
.
most_common
(
15
)]
common_words
=
[
w
[
0
][
0
]
for
w
in
common_words
]
return
common_words
=======
>>>>>>>
d2692ff014493cfae8c975b6a9b16d55f549c9fb
'''
def find_salient_ngrams(self, wh_page, n, case_sensitive, threshold):
...
...
@@ -692,11 +606,3 @@ class Article:
else
:
info
+=
'
\n
No Sources Cited'
return
info
<<<<<<<
HEAD
def
test
():
print
(
inappropriate_words_lst
)
=======
>>>>>>>
d2692ff014493cfae8c975b6a9b16d55f549c9fb
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment