Commit 1e33abd4 by Michelle Awh

GOT ALL THE RELATED ARTICLE STUFF WORKING YEEEEEEEEEE

parent 41539005
@@ -225,13 +225,16 @@ class Article:
         self.author, self.author_bio = self.get_author(self.__soup)
         self.num_coauthors, self.date_updated, self.num_views = \
             self.get_sp_text_data(self.__soup)
-        self.url = None
+        self.url = url
         self.pct_helpful, self.num_voters = self.get_helpful(self.__soup)
         self.rating = self.find_rating(self.__soup)
         self.languages = self.find_languages(self.__soup)
         self.view = self.__repr__()
         self.words = self.scrape_words(self.__soup)
         self.tags = self.find_common_words(self.words)
+        self.link_family = self.all_peripheral_links(self.__soup, self.url)
+        print('making related titles')
+        self.related_articles, self.related_categories = self.related_enough()

     def get_page_soup(self, url):
         response = requests.get(url)
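Aside on the constructor change: the page URL is now stored on the instance instead of being reset to None, and the related-article lookup runs once at construction time. A minimal usage sketch (the URL is hypothetical; this assumes the constructor takes the page URL as its argument, which `self.url = url` suggests):

    article = Article('https://www.wikihow.com/Make-Bread')  # hypothetical URL
    print(article.related_articles)    # {title: url} for articles matching a tag
    print(article.related_categories)  # {title: url} for categories matching a tag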
@@ -338,7 +341,6 @@ class Article:
             main_text.append(match.group(1))
         top = title + description
         top = re.findall("[^\n][a-zA-Z']+", top)
-        print("AAA")
         text = [y.strip() for y in top]
         for t in main_text:
             text += t.split()
@@ -346,11 +348,13 @@ class Article:

     def all_peripheral_links(self, soup, url):
-        related_links = get_related(soup, url)
-        expanding, full_expanse = expanding_breadcrumbs(related_links)
-        link_family = ({**related_links, **expanding}, full_expanse)
-        #to expand further we simply call expanding_breadcrumbs on full expanse
-        return link_family
+        related_links = self.get_related(self.__soup, url)
+        expanding, full_expanse = self.expanding_breadcrumbs(related_links, soup)
+        link_family = {**related_links, **expanding, **full_expanse}
+        all_links = []
+        for f in link_family.values():
+            all_links += f
+        return all_links

     def get_related(self, soup, starting_url):
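all_peripheral_links now returns a flat list of URLs instead of the old (dict, dict) pair: the three section-to-links dicts are merged with `{**a, **b, **c}` and their value lists concatenated. A standalone sketch of that merge-and-flatten pattern, with illustrative values only:

    related = {'relatedwikihows': ['u1', 'u2']}
    expanding = {'breadcrumbs': ['u3']}
    full_expanse = {'subcat_container': ['u2', 'u4']}
    link_family = {**related, **expanding, **full_expanse}
    all_links = []
    for links in link_family.values():
        all_links += links
    print(all_links)  # ['u1', 'u2', 'u3', 'u2', 'u4'] -- duplicates survive

One caveat of the merge: if two of the dicts share a key, the later dict's list silently wins.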
@@ -367,14 +371,14 @@ class Article:
         for id_ in ids:
             if soup.find(id='{}'.format(id_)) != None:
                 links = soup.find(id='{}'.format(id_)).find_all('a')
-                related[id_] = linked_urls(links, starting_url)
+                related[id_] = self.linked_urls(links, starting_url)
         for class_ in classes:
             box_a_lst = []
             peripherals = soup.find_all('div', class_ = '{}'.format(class_))
             for box in peripherals:
                 if box.a != None:
                     box_a_lst += box.find_all('a')
-            related[class_] = linked_urls(box_a_lst, starting_url)
+            related[class_] = self.linked_urls(box_a_lst, starting_url)
         return related
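For reference on the BeautifulSoup calls above: `soup.find(id=...)` returns the first element with that id, or None, and `.find_all('a')` gathers every anchor beneath it. A self-contained sketch of the same lookup pattern (illustrative HTML; assumes bs4 is installed):

    from bs4 import BeautifulSoup

    html = '<div id="relatedwikihows"><a href="/Bake-a-Cake">Bake a Cake</a></div>'
    soup = BeautifulSoup(html, 'html.parser')
    section = soup.find(id='relatedwikihows')
    if section is not None:                # same guard as the != None check above
        links = section.find_all('a')
        print([a['href'] for a in links])  # ['/Bake-a-Cake']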
@@ -392,12 +396,12 @@ class Article:
         for relative in relatives:
             if relative.a != None:
                 full_breadcrumbs += relative.find_all('a')
-        expanded[listing] = linked_urls(full_breadcrumbs, listing)
+        expanded[listing] = self.linked_urls(full_breadcrumbs, listing)
         distant_relatives = soup.find_all('div', class_ = "subcat_container")
         for category in distant_relatives:
             if category.a != None:
                 loose_breadcrumbs += category.find_all('a')
-        full_expanse[listing] = linked_urls(loose_breadcrumbs, listing)
+        full_expanse[listing] = self.linked_urls(loose_breadcrumbs, listing)
         return expanded, full_expanse
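The `relative.a != None` and `category.a != None` guards lean on BeautifulSoup's tag-name shorthand: `box.a` is the first <a> descendant of the element, or None when it contains no links. A condensed sketch (illustrative HTML):

    from bs4 import BeautifulSoup

    html = ('<div class="subcat_container"><a href="/Category:Breads">Breads</a></div>'
            '<div class="subcat_container"><p>no links here</p></div>')
    soup = BeautifulSoup(html, 'html.parser')
    crumbs = []
    for box in soup.find_all('div', class_='subcat_container'):
        if box.a is not None:   # skip containers without anchors
            crumbs += box.find_all('a')
    print([a['href'] for a in crumbs])  # ['/Category:Breads']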
@@ -415,6 +419,50 @@ class Article:
         return links

+    def extract_titles(self, lst):
+        titles = {}
+        categories = {}
+        for url in lst:
+            if 'Special:CategoryListing' in url:
+                continue
+            if 'Category:' in url:
+                try:
+                    hyphen_title = re.search('Category:(.*)', url).group(1)
+                    title = ' '.join(hyphen_title.split('-'))
+                except AttributeError:
+                    continue
+                categories[title] = url
+            else:
+                try:
+                    hyphen_title = re.search('.com/(.*)', url).group(1)
+                    title = ' '.join(hyphen_title.split('-'))
+                except AttributeError:
+                    continue
+                titles[title] = url
+        return (titles, categories)
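extract_titles turns wikiHow-style URLs back into readable titles: category pages are matched with `Category:(.*)`, article pages with `.com/(.*)`, and hyphens become spaces. A quick illustration with hypothetical URLs:

    import re

    cat_url = 'https://www.wikihow.com/Category:Home-and-Garden'
    print(' '.join(re.search('Category:(.*)', cat_url).group(1).split('-')))
    # Home and Garden

    art_url = 'https://www.wikihow.com/Plant-a-Tree'
    print(' '.join(re.search('.com/(.*)', art_url).group(1).split('-')))
    # Plant a Tree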
+
+    def related_enough(self):
+        links = self.all_peripheral_links(self.__soup, self.url)
+        articles, categories = self.extract_titles(links)
+        actual_related_articles = {}
+        actual_related_categories = {}
+        for t in articles:
+            for tag in self.tags:
+                Tag = tag[0].upper() + tag[1:]
+                if Tag in t:
+                    actual_related_articles[t] = articles[t]
+                    break
+        for t in categories:
+            for tag in self.tags:
+                Tag = tag[0].upper() + tag[1:]  # title-case the tag so it can match words in the title
+                if Tag in t:
+                    actual_related_categories[t] = categories[t]
+                    break
+        return (actual_related_articles, actual_related_categories)

     def pre_processing(self, wh_page, caps):
         list_of_words = wh_page.split()
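The relatedness test in related_enough is a capitalized-substring check: each tag is title-cased and searched for inside the candidate title, so 'bread' matches 'Knead Bread Dough' but would also match 'Breadboard Basics'. A tiny illustration with made-up values:

    tags = ['bread', 'dough']
    title = 'Knead Bread Dough'
    hits = [t for t in tags if (t[0].upper() + t[1:]) in title]
    print(hits)  # ['bread', 'dough'] -> counted as related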