Commit 1e33abd4 by Michelle Awh

GOT ALL THE RELATED ARTICLE STUFF WORKING YEEEEEEEEEE

parent 41539005
@@ -225,13 +225,16 @@ class Article:
         self.author, self.author_bio = self.get_author(self.__soup)
         self.num_coauthors, self.date_updated, self.num_views = \
             self.get_sp_text_data(self.__soup)
-        self.url = None
+        self.url = url
         self.pct_helpful, self.num_voters = self.get_helpful(self.__soup)
         self.rating = self.find_rating(self.__soup)
         self.languages = self.find_languages(self.__soup)
         self.view = self.__repr__()
         self.words = self.scrape_words(self.__soup)
         self.tags = self.find_common_words(self.words)
+        self.link_family = self.all_peripheral_links(self.__soup, self.url)
+        print('making related titles')
+        self.related_articles, self.related_categories = self.related_enough()

     def get_page_soup(self, url):
         response = requests.get(url)
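Aside on the constructor change: the page URL is now stored on the instance instead of being reset to None, and the related-article lookup runs once at construction time. A minimal usage sketch (the URL is hypothetical; this assumes the constructor takes the page URL as its argument, which `self.url = url` suggests):

    article = Article('https://www.wikihow.com/Make-Bread')  # hypothetical URL
    print(article.related_articles)    # {title: url} for articles matching a tag
    print(article.related_categories)  # {title: url} for categories matching a tag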
@@ -338,7 +341,6 @@ class Article:
             main_text.append(match.group(1))
         top = title + description
         top = re.findall("[^\n][a-zA-Z']+", top)
-        print("AAA")
         text = [y.strip() for y in top]
         for t in main_text:
             text += t.split()
@@ -346,11 +348,13 @@ class Article:

     def all_peripheral_links(self, soup, url):
-        related_links = get_related(soup, url)
-        expanding, full_expanse = expanding_breadcrumbs(related_links)
-        link_family = ({**related_links, **expanding}, full_expanse)
-        #to expand further we simply call expanding_breadcrumbs on full expanse
-        return link_family
+        related_links = self.get_related(self.__soup, url)
+        expanding, full_expanse = self.expanding_breadcrumbs(related_links, soup)
+        link_family = {**related_links, **expanding, **full_expanse}
+        all_links = []
+        for f in link_family.values():
+            all_links += f
+        return all_links

     def get_related(self, soup, starting_url):
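all_peripheral_links now returns a flat list of URLs instead of the old (dict, dict) pair: the three section-to-links dicts are merged with `{**a, **b, **c}` and their value lists concatenated. A standalone sketch of that merge-and-flatten pattern, with illustrative values only:

    related = {'relatedwikihows': ['u1', 'u2']}
    expanding = {'breadcrumbs': ['u3']}
    full_expanse = {'subcat_container': ['u2', 'u4']}
    link_family = {**related, **expanding, **full_expanse}
    all_links = []
    for links in link_family.values():
        all_links += links
    print(all_links)  # ['u1', 'u2', 'u3', 'u2', 'u4'] -- duplicates survive

One caveat of the merge: if two of the dicts share a key, the later dict's list silently wins.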
@@ -367,14 +371,14 @@ class Article:
         for id_ in ids:
             if soup.find(id='{}'.format(id_)) != None:
                 links = soup.find(id='{}'.format(id_)).find_all('a')
-                related[id_] = linked_urls(links, starting_url)
+                related[id_] = self.linked_urls(links, starting_url)
         for class_ in classes:
             box_a_lst = []
             peripherals = soup.find_all('div', class_ = '{}'.format(class_))
             for box in peripherals:
                 if box.a != None:
                     box_a_lst += box.find_all('a')
-            related[class_] = linked_urls(box_a_lst, starting_url)
+            related[class_] = self.linked_urls(box_a_lst, starting_url)
         return related
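For reference on the BeautifulSoup calls above: `soup.find(id=...)` returns the first element with that id, or None, and `.find_all('a')` gathers every anchor beneath it. A self-contained sketch of the same lookup pattern (illustrative HTML; assumes bs4 is installed):

    from bs4 import BeautifulSoup

    html = '<div id="relatedwikihows"><a href="/Bake-a-Cake">Bake a Cake</a></div>'
    soup = BeautifulSoup(html, 'html.parser')
    section = soup.find(id='relatedwikihows')
    if section is not None:                # same guard as the != None check above
        links = section.find_all('a')
        print([a['href'] for a in links])  # ['/Bake-a-Cake']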
@@ -392,12 +396,12 @@ class Article:
         for relative in relatives:
             if relative.a != None:
                 full_breadcrumbs += relative.find_all('a')
-        expanded[listing] = linked_urls(full_breadcrumbs, listing)
+        expanded[listing] = self.linked_urls(full_breadcrumbs, listing)
         distant_relatives = soup.find_all('div', class_ = "subcat_container")
         for category in distant_relatives:
             if category.a != None:
                 loose_breadcrumbs += category.find_all('a')
-        full_expanse[listing] = linked_urls(loose_breadcrumbs, listing)
+        full_expanse[listing] = self.linked_urls(loose_breadcrumbs, listing)
         return expanded, full_expanse
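The `relative.a != None` and `category.a != None` guards lean on BeautifulSoup's tag-name shorthand: `box.a` is the first <a> descendant of the element, or None when it contains no links. A condensed sketch (illustrative HTML):

    from bs4 import BeautifulSoup

    html = ('<div class="subcat_container"><a href="/Category:Breads">Breads</a></div>'
            '<div class="subcat_container"><p>no links here</p></div>')
    soup = BeautifulSoup(html, 'html.parser')
    crumbs = []
    for box in soup.find_all('div', class_='subcat_container'):
        if box.a is not None:   # skip containers without anchors
            crumbs += box.find_all('a')
    print([a['href'] for a in crumbs])  # ['/Category:Breads']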
@@ -415,6 +419,50 @@ class Article:
         return links

+    def extract_titles(self, lst):
+        titles = {}
+        categories = {}
+        for url in lst:
+            if 'Special:CategoryListing' in url:
+                continue
+            if 'Category:' in url:
+                try:
+                    hyphen_title = re.search('Category:(.*)', url).group(1)
+                    title = ' '.join(hyphen_title.split('-'))
+                except AttributeError:
+                    continue
+                categories[title] = url
+            else:
+                try:
+                    hyphen_title = re.search('.com/(.*)', url).group(1)
+                    title = ' '.join(hyphen_title.split('-'))
+                except AttributeError:
+                    continue
+                titles[title] = url
+        return (titles, categories)
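extract_titles turns wikiHow-style URLs back into readable titles: category pages are matched with `Category:(.*)`, article pages with `.com/(.*)`, and hyphens become spaces. A quick illustration with hypothetical URLs:

    import re

    cat_url = 'https://www.wikihow.com/Category:Home-and-Garden'
    print(' '.join(re.search('Category:(.*)', cat_url).group(1).split('-')))
    # Home and Garden

    art_url = 'https://www.wikihow.com/Plant-a-Tree'
    print(' '.join(re.search('.com/(.*)', art_url).group(1).split('-')))
    # Plant a Tree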
+
+    def related_enough(self):
+        links = self.all_peripheral_links(self.__soup, self.url)
+        articles, categories = self.extract_titles(links)
+        actual_related_articles = {}
+        actual_related_categories = {}
+        for t in articles:
+            for tag in self.tags:
+                Tag = tag[0].upper() + tag[1:]
+                if Tag in t:
+                    actual_related_articles[t] = articles[t]
+                    break
+        for t in categories:
+            for tag in self.tags:
+                Tag = tag[0].upper() + tag[1:]  # title-case the tag so it can match words in the title
+                if Tag in t:
+                    actual_related_categories[t] = categories[t]
+                    break
+        return (actual_related_articles, actual_related_categories)

     def pre_processing(self, wh_page, caps):
         list_of_words = wh_page.split()
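The relatedness test in related_enough is a capitalized-substring check: each tag is title-cased and searched for inside the candidate title, so 'bread' matches 'Knead Bread Dough' but would also match 'Breadboard Basics'. A tiny illustration with made-up values:

    tags = ['bread', 'dough']
    title = 'Knead Bread Dough'
    hits = [t for t in tags if (t[0].upper() + t[1:]) in title]
    print(hits)  # ['bread', 'dough'] -> counted as related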