Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
Michelle Awh
/
project_kitty
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Pipelines
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
e7dc01bb
authored
Mar 12, 2021
by
Alex Chang
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added some docstrings
parent
9eb0b792
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
13 additions
and
27 deletions
SR_alex.py
SR_alex.py
View file @
e7dc01bb
...
...
@@ -14,26 +14,11 @@ import re
import
urllib.parse
import
sys
import
unicodedata
import
nltk
from
nltk.corpus
import
stopwords
def
keep_chr
(
ch
):
'''
Find all characters that are classifed as punctuation in Unicode
(except #, @, &) and combine them into a single string.
'''
return
unicodedata
.
category
(
ch
)
.
startswith
(
'P'
)
and
\
(
ch
not
in
(
"#"
,
"@"
,
"&"
))
PUNCTUATION
=
" "
.
join
([
chr
(
i
)
for
i
in
range
(
sys
.
maxunicode
)
if
keep_chr
(
chr
(
i
))])
STOP_WORDS
=
[
"a"
,
"an"
,
"the"
,
"this"
,
"that"
,
"of"
,
"for"
,
"or"
,
"and"
,
"on"
,
"to"
,
"be"
,
"if"
,
"we"
,
"you"
,
"in"
,
"is"
,
"at"
,
"it"
,
"rt"
,
"mt"
,
"with"
,
"t"
,
"don"
,
"wh"
]
STOP_PREFIXES
=
(
"@"
,
"#"
,
"http"
,
"&"
)
...
...
@@ -409,14 +394,17 @@ class Article:
def
pre_processing
(
self
,
wh_page
,
caps
):
list_of_words
=
wh_page
.
split
()
list_of_words
=
[
word
.
strip
(
PUNCTUATION
)
for
word
in
list_of_words
\
if
word
.
strip
(
PUNCTUATION
)
!=
''
]
if
not
caps
:
list_of_words
=
[
word
.
lower
()
for
word
in
list_of_words
]
list_of_words
=
[
word
for
word
in
list_of_words
if
word
not
in
STOP_WORDS
]
list_of_words
=
[
word
for
word
in
list_of_words
\
if
not
word
.
startswith
(
STOP_PREFIXES
)]
#list_of_words = wh_page.split()
tokenizer
=
nltk
.
RegexpTokenizer
(
r"\w+"
)
list_of_words
=
tokenizer
.
tokenize
(
wh_page
)
print
(
list_of_words
)
#list_of_words = [word.strip(PUNCTUATION) for word in list_of_words \
# if word.strip(PUNCTUATION) != '']
#if not caps:
# list_of_words = [word.lower() for word in list_of_words]
list_of_words
=
[
word
for
word
in
list_of_words
if
word
not
in
stopwords
.
words
(
'english'
)]
#list_of_words = [word for word in list_of_words \
# if not word.startswith(STOP_PREFIXES)]
return
list_of_words
...
...
@@ -424,9 +412,7 @@ class Article:
words_lst
=
Counter
()
strng_lst
=
self
.
n_gram
(
string_lst
,
False
,
1
)
print
(
strng_lst
)
words_lst
.
update
(
w
for
w
in
strng_lst
)
print
(
words_lst
)
common_words
=
[
w
for
w
in
words_lst
.
most_common
(
15
)]
common_words
=
[
w
[
0
][
0
]
for
w
in
common_words
]
return
common_words
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment