Commit e7dc01bb by Alex Chang

added some docstrings

parent 9eb0b792
Showing with 13 additions and 27 deletions
......@@ -14,26 +14,11 @@ import re
import urllib.parse
import sys
import unicodedata
import nltk
from nltk.corpus import stopwords
def keep_chr(ch):
    '''
    Decide whether a single character counts as strippable punctuation.

    Inputs:
        ch: a one-character string

    Returns: True if the Unicode general category of ch starts with 'P'
        (i.e. it is a punctuation character) and ch is not one of
        "#", "@" or "&"; False otherwise.
    '''
    is_punctuation = unicodedata.category(ch).startswith('P')
    return is_punctuation and ch not in ("#", "@", "&")
# Every character below sys.maxunicode that keep_chr accepts, joined into
# one string with a single space between neighbouring characters.
PUNCTUATION = " ".join(filter(keep_chr, map(chr, range(sys.maxunicode))))

# Lowercase words treated as stop words.
STOP_WORDS = [
    "a", "an", "the", "this", "that", "of", "for", "or",
    "and", "on", "to", "be", "if", "we", "you", "in", "is",
    "at", "it", "rt", "mt", "with", "t", "don", "wh",
]

# Leading substrings that mark a word as a stop word.
STOP_PREFIXES = ("@", "#", "http", "&amp")
......@@ -409,14 +394,17 @@ class Article:
def pre_processing(self, wh_page, caps):
    '''
    Split the text of a page into word tokens and drop English stop words.

    Inputs:
        wh_page: string holding the text to tokenize
        caps: currently unused; the lower-casing step it used to control
            is disabled, so tokens keep their original case
            (NOTE(review): confirm whether that is intended)

    Returns: list of tokens (maximal runs of word characters) in their
        original order, excluding NLTK's English stop words
    '''
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    # Load the stop-word list once instead of once per token; a set makes
    # each membership test constant-time. The comparison is case-sensitive.
    english_stop_words = set(stopwords.words('english'))
    return [word for word in tokenizer.tokenize(wh_page)
            if word not in english_stop_words]
......@@ -424,9 +412,7 @@ class Article:
words_lst = Counter()
strng_lst = self.n_gram(string_lst, False, 1)
print(strng_lst)
words_lst.update(w for w in strng_lst)
print(words_lst)
common_words = [w for w in words_lst.most_common(15)]
common_words = [w[0][0] for w in common_words]
return common_words
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment