added some docstrings

e7dc01bb · Alex Chang · 9eb0b792 · e7dc01bb
Commit e7dc01bb authored Mar 12, 2021 by Alex Chang
Showing with 13 additions and 27 deletions
SR_alex.py
--- a/SR_alex.py
+++ b/SR_alex.py
@@ -14,26 +14,11 @@ import re
 import urllib.parse
 import sys
 import unicodedata
+import nltk
+from nltk.corpus import stopwords



-def keep_chr(ch):
-    '''
-    Find all characters that are classified as punctuation in Unicode
-    (except #, @, &) and combine them into a single string.
-    '''
-    return unicodedata.category(ch).startswith('P') and \
-        (ch not in ("#", "@", "&"))
-
-
-PUNCTUATION = " ".join([chr(i) for i in range(sys.maxunicode)
-                        if keep_chr(chr(i))])
-
-STOP_WORDS = ["a", "an", "the", "this", "that", "of", "for", "or",
-              "and", "on", "to", "be", "if", "we", "you", "in", "is",
-              "at", "it", "rt", "mt", "with", "t", "don", "wh"]
-
-STOP_PREFIXES = ("@", "#", "http", "&amp")



@@ -409,14 +394,17 @@ class Article:

    def pre_processing(self, wh_page, caps):
        
-        list_of_words = wh_page.split()
-        list_of_words = [word.strip(PUNCTUATION) for word in list_of_words \
-                                                if word.strip(PUNCTUATION) != '']
-        if not caps:
-            list_of_words = [word.lower() for word in list_of_words]        
-        list_of_words = [word for word in list_of_words if word not in STOP_WORDS]
-        list_of_words = [word for word in list_of_words \
-                                            if not word.startswith(STOP_PREFIXES)]
+        #list_of_words = wh_page.split()
+        tokenizer = nltk.RegexpTokenizer(r"\w+")
+        list_of_words = tokenizer.tokenize(wh_page)
+        print(list_of_words)
+        #list_of_words = [word.strip(PUNCTUATION) for word in list_of_words \
+                                               # if word.strip(PUNCTUATION) != '']
+        #if not caps:
+           # list_of_words = [word.lower() for word in list_of_words]        
+        list_of_words = [word for word in list_of_words if word not in stopwords.words('english')]
+        #list_of_words = [word for word in list_of_words \
+                                           # if not word.startswith(STOP_PREFIXES)]
        return list_of_words


@@ -424,9 +412,7 @@ class Article:

        words_lst = Counter()
        strng_lst = self.n_gram(string_lst, False, 1)
-        print(strng_lst)
        words_lst.update(w for w in strng_lst)
-        print(words_lst)
        common_words = [w for w in words_lst.most_common(15)]
        common_words = [w[0][0] for w in common_words]
        return common_words