Commit 41539005 by Michelle Awh

GOT WORDS AND TAGS WORKING EYAHAHAHAHAHAHAHA

parent 97c58eda
Showing 26 additions and 86 deletions
@@ -14,6 +14,7 @@ import re
 import urllib.parse
 import sys
 import unicodedata
+from collections import Counter
@@ -229,11 +230,8 @@ class Article:
         self.rating = self.find_rating(self.__soup)
         self.languages = self.find_languages(self.__soup)
         self.view = self.__repr__()
-        self.words = self.scrape_words(self.url, self.__soup)
-        '''
-        self.salient = self.find_salient_ngrams(self.words,\
-                                                1, False, 0.5)
-        '''
+        self.words = self.scrape_words(self.__soup)
+        self.tags = self.find_common_words(self.words)

     def get_page_soup(self, url):
         response = requests.get(url)
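With this change the tag pipeline runs at construction time: the article text is scraped into one long string, then reduced to its most common words. A hypothetical usage sketch (the constructor signature and URL are assumptions here, and a live network connection is required):

```python
# Hypothetical usage; assumes Article takes the article URL and the page is reachable.
art = Article("https://www.wikihow.com/Boil-Water")
print(art.words[:80])  # one long string: title + description + step text
print(art.tags)        # up to 15 most common unigrams from that string
```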
@@ -324,7 +322,7 @@ class Article:
         return translations

-    def scrape_words(self, url, soup):
+    def scrape_words(self, soup):
         """
         Take all the steps and descriptions from a given
         wikihow article and lump them into one large string
@@ -332,18 +330,19 @@ class Article:
         title = soup.find("h1", id = "section_0").text
         description = soup.find("div", class_ = "mf-section-0").text
         steps = soup.find_all("div", class_ = "step")
         text = []
+        main_text = []
+        for step in steps:
+            s = str(step)
+            match = re.search('/b>(.*)<sup', s)
+            if match != None:
+                main_text.append(match.group(1))
         top = title + description
         top = re.findall("[^\n][a-zA-Z']+", top)
-        for y in top:
-            y = y.strip()
-            text.append(y)
-        for s in steps:
-            t = re.findall("[^\n][a-zA-Z']+", s.text)
-            for x in t:
-                x = x.strip()
-                text.append(x)
-        return " ".join(text)
+        print("AAA")
+        text = [y.strip() for y in top]
+        for t in main_text:
+            text += t.split()
+        return ' '.join(text)

     def all_peripheral_links(self, soup, url):
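The rewritten extraction keys off wikihow's step markup, where each step opens with a bolded summary followed by detail text and citation superscripts; the pattern '/b>(.*)<sup' captures whatever sits between the closing </b> and a <sup> tag. A minimal sketch against made-up step HTML (the markup below is illustrative, not from a real page):

```python
import re

# Illustrative step markup (hypothetical): bold summary, detail text, citation.
step_html = ('<div class="step"><b class="whb">Boil the water.</b> '
             'Fill a large pot and bring it to a rolling boil.'
             '<sup id="_ref-1">[1]</sup></div>')

match = re.search('/b>(.*)<sup', step_html)
if match is not None:  # steps without any <sup> citation produce no match
    print(match.group(1).strip())
    # -> Fill a large pot and bring it to a rolling boil.
```

Since .* is greedy, a step containing several citations matches through to its last <sup>, and a step with no citation at all is skipped entirely, which is why the guard on match matters.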
@@ -415,35 +414,23 @@ class Article:
             links.append(filtered_link)
         return links

-    '''
-    def find_salient_ngrams(self, wh_page, n, case_sensitive, threshold):
-        all_n_grams = []
-        ngram = n_gram_salient(wh_page,case_sensitive,n)
-        all_n_grams.append(ngram)
-        salient_ngrams = find_salient(all_n_grams,threshold)
-        return salient_ngrams

-    def pre_processing(self, wh_page,caps, salient):
+    def pre_processing(self, wh_page, caps):
         list_of_words = wh_page.split()
         list_of_words = [word.strip(PUNCTUATION) for word in list_of_words \
                          if word.strip(PUNCTUATION) != '']
         if not caps:
             list_of_words = [word.lower() for word in list_of_words]
-        if not salient: #if we want to do things like find top k or find min etc.
-            list_of_words = [word for word in list_of_words if word not in STOP_WORDS]
+        list_of_words = [word for word in list_of_words if word not in STOP_WORDS]
         list_of_words = [word for word in list_of_words \
                          if not word.startswith(STOP_PREFIXES)]
         return list_of_words
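pre_processing no longer needs the salient flag because the tf-idf path is gone, so stop words are always dropped. A standalone mirror of the cleaning pipeline, with assumed values for the module constants (PUNCTUATION, STOP_WORDS, and STOP_PREFIXES live elsewhere in the real file and may differ):

```python
# Assumed stand-ins for the module-level constants.
PUNCTUATION = '.,!?;:"()'
STOP_WORDS = {'the', 'a', 'an', 'to', 'and'}
STOP_PREFIXES = ('http', 'www')

def pre_processing(wh_page, caps):
    # Split, strip punctuation, optionally lowercase, drop stop words/prefixes.
    words = [w.strip(PUNCTUATION) for w in wh_page.split()
             if w.strip(PUNCTUATION) != '']
    if not caps:
        words = [w.lower() for w in words]
    words = [w for w in words if w not in STOP_WORDS]
    return [w for w in words if not w.startswith(STOP_PREFIXES)]

print(pre_processing("Boil the water, then add pasta.", caps=False))
# -> ['boil', 'water', 'then', 'add', 'pasta']
```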
     def n_gram_salient(self, wh_page, caps, n):
         n_gram_lst = []
-        list_of_words = pre_processing(wh_page,caps)
+        list_of_words = self.pre_processing(wh_page,caps)
         start_value = 0
         stop_value = n
         while stop_value <= len(list_of_words):
@@ -452,63 +439,16 @@ class Article:
             stop_value +=1
         return n_gram_lst
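The fold between hunks hides the loop body, but the counters make the shape clear: a width-n window slides across the token list one position at a time. A standalone sketch of the same idea (the tuple append is an assumption, though the w[0][0] indexing in find_common_words below implies the list holds tuples):

```python
def n_grams(tokens, n):
    # Slide a width-n window over the tokens, emitting one tuple per position.
    grams = []
    start, stop = 0, n
    while stop <= len(tokens):
        grams.append(tuple(tokens[start:stop]))
        start += 1
        stop += 1
    return grams

print(n_grams(['boil', 'water', 'add', 'pasta'], 2))
# -> [('boil', 'water'), ('water', 'add'), ('add', 'pasta')]
```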
+    def find_common_words(self, string):
+        words_lst = Counter()
+        string_lst = self.n_gram_salient(string, False, 1)
+        words_lst.update(w for w in string_lst)
+        common_words = [w for w in words_lst.most_common(15)]
+        common_words = [w[0][0] for w in common_words]
+        return common_words
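This is what the new Counter import is for: update tallies the unigram tuples, most_common(15) returns (item, count) pairs, and w[0][0] unwraps the pair, then the 1-tuple, down to the bare word. A minimal standalone illustration:

```python
from collections import Counter

# n_gram_salient with n=1 yields one-word tuples such as ('water',).
unigrams = [('water',), ('boil',), ('water',), ('pasta',), ('water',)]

counts = Counter()
counts.update(unigrams)
top = counts.most_common(2)    # -> [(('water',), 3), (('boil',), 1)]
tags = [w[0][0] for w in top]  # (item, count) pair -> 1-tuple -> word
print(tags)                    # -> ['water', 'boil']
```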
-    def calc_tf(self, a_token, tokens):
-        """
-        calculates term frequency
-        """
-        token_list = count_tokens(tokens)
-        max_term = max(token_list.values())
-        f_term = token_list[a_token]
-        tf = 0.5+0.5*(f_term/max_term)
-        return tf
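The deleted helper computed augmented term frequency, tf = 0.5 + 0.5 * (f_t / max_f), which pins the document's most frequent token at 1.0 and everything else in (0.5, 1.0]. A Counter-based equivalent of calc_tf plus its count_tokens companion, shown standalone:

```python
from collections import Counter

def calc_tf(a_token, tokens):
    # Augmented term frequency: the most frequent token scores exactly 1.0.
    counts = Counter(tokens)
    return 0.5 + 0.5 * (counts[a_token] / max(counts.values()))

print(calc_tf('boil', ['water', 'water', 'boil']))  # -> 0.5 + 0.5*(1/2) = 0.75
```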
-    def count_tokens(self, tokens):
-        token_dict = {}
-        for token in tokens:
-            if token not in token_dict:
-                token_dict[token] = 1
-            else:
-                token_dict[token] += 1
-        return token_dict
-    def calc_idf(self, docs,a_token):
-        """
-        calculates inverse document frequency
-        """
-        docs_with_t = 0
-        N = len(docs)
-        for lst in docs:
-            if a_token in lst:
-                docs_with_t +=1
-        idf = math.log(N/docs_with_t)
-        return idf
-    def find_salient(self, docs, threshold):
-        """
-        Compute the salient words for each document. A word is salient if
-        its tf-idf score is strictly above a given threshold.
-
-        Inputs:
-            docs: list of list of tokens
-            threshold: float
-
-        Returns: list of sets of salient words
-        """
-        salient_lst = []
-        for tok_lst in docs:
-            new_set = set()
-            for a_token in tok_lst:
-                tf_idf = calc_tf(a_token,tok_lst) * calc_idf(docs, a_token)
-                if tf_idf > threshold:
-                    new_set.add(a_token)
-            salient_lst.append(new_set)
-        return salient_lst
-    '''
     def __repr__(self):
         info = self.title
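For reference, the tf-idf path deleted above scored each token as tf * idf with idf = ln(N / n_t) over the document collection, keeping tokens strictly above the threshold. A compact standalone version of that logic (using math.log as the old code did, and the augmented tf from calc_tf):

```python
import math
from collections import Counter

def find_salient(docs, threshold):
    # docs: list of token lists; returns one set of salient tokens per doc.
    salient = []
    for tokens in docs:
        counts = Counter(tokens)
        max_f = max(counts.values())
        keep = set()
        for tok in counts:
            tf = 0.5 + 0.5 * (counts[tok] / max_f)
            idf = math.log(len(docs) / sum(tok in d for d in docs))
            if tf * idf > threshold:
                keep.add(tok)
        salient.append(keep)
    return salient

docs = [['water', 'water', 'boil'], ['pasta', 'water']]
print(find_salient(docs, 0.3))  # -> [{'boil'}, {'pasta'}]
# 'water' appears in every doc, so idf = ln(2/2) = 0 and it is never salient.
```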