Commit 41539005 by Michelle Awh

GOT WORDS AND TAGS WORKING EYAHAHAHAHAHAHAHA

parent 97c58eda
Showing 26 additions and 86 deletions
@@ -14,6 +14,7 @@ import re
 import urllib.parse
 import sys
 import unicodedata
+from collections import Counter
@@ -229,11 +230,8 @@ class Article:
         self.rating = self.find_rating(self.__soup)
         self.languages = self.find_languages(self.__soup)
         self.view = self.__repr__()
-        self.words = self.scrape_words(self.url, self.__soup)
-        '''
-        self.salient = self.find_salient_ngrams(self.words,\
-                                                1, False, 0.5)
-        '''
+        self.words = self.scrape_words(self.__soup)
+        self.tags = self.find_common_words(self.words)

     def get_page_soup(self, url):
         response = requests.get(url)
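With this change the tag pipeline runs at construction time: the article text is scraped into one long string, then reduced to its most common words. A hypothetical usage sketch (the constructor signature and URL are assumptions here, and a live network connection is required):

```python
# Hypothetical usage; assumes Article takes the article URL and the page is reachable.
art = Article("https://www.wikihow.com/Boil-Water")
print(art.words[:80])  # one long string: title + description + step text
print(art.tags)        # up to 15 most common unigrams from that string
```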
@@ -324,7 +322,7 @@ class Article:
         return translations

-    def scrape_words(self, url, soup):
+    def scrape_words(self, soup):
         """
         Take all the steps and descriptions from a given
         wikihow article and lump them into one large string
@@ -332,18 +330,19 @@ class Article:
         title = soup.find("h1", id = "section_0").text
         description = soup.find("div", class_ = "mf-section-0").text
         steps = soup.find_all("div", class_ = "step")
         text = []
+        main_text = []
+        for step in steps:
+            s = str(step)
+            match = re.search('/b>(.*)<sup', s)
+            if match != None:
+                main_text.append(match.group(1))
         top = title + description
         top = re.findall("[^\n][a-zA-Z']+", top)
-        for y in top:
-            y = y.strip()
-            text.append(y)
-        for s in steps:
-            t = re.findall("[^\n][a-zA-Z']+", s.text)
-            for x in t:
-                x = x.strip()
-                text.append(x)
-        return " ".join(text)
+        print("AAA")
+        text = [y.strip() for y in top]
+        for t in main_text:
+            text += t.split()
+        return ' '.join(text)

     def all_peripheral_links(self, soup, url):
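The rewritten extraction keys off wikihow's step markup, where each step opens with a bolded summary followed by detail text and citation superscripts; the pattern '/b>(.*)<sup' captures whatever sits between the closing </b> and a <sup> tag. A minimal sketch against made-up step HTML (the markup below is illustrative, not from a real page):

```python
import re

# Illustrative step markup (hypothetical): bold summary, detail text, citation.
step_html = ('<div class="step"><b class="whb">Boil the water.</b> '
             'Fill a large pot and bring it to a rolling boil.'
             '<sup id="_ref-1">[1]</sup></div>')

match = re.search('/b>(.*)<sup', step_html)
if match is not None:  # steps without any <sup> citation produce no match
    print(match.group(1).strip())
    # -> Fill a large pot and bring it to a rolling boil.
```

Since .* is greedy, a step containing several citations matches through to its last <sup>, and a step with no citation at all is skipped entirely, which is why the guard on match matters.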
@@ -415,35 +414,23 @@ class Article:
             links.append(filtered_link)
         return links

-    '''
-    def find_salient_ngrams(self, wh_page, n, case_sensitive, threshold):
-        all_n_grams = []
-        ngram = n_gram_salient(wh_page,case_sensitive,n)
-        all_n_grams.append(ngram)
-        salient_ngrams = find_salient(all_n_grams,threshold)
-        return salient_ngrams

-    def pre_processing(self, wh_page,caps, salient):
+    def pre_processing(self, wh_page, caps):
         list_of_words = wh_page.split()
         list_of_words = [word.strip(PUNCTUATION) for word in list_of_words \
                          if word.strip(PUNCTUATION) != '']
         if not caps:
             list_of_words = [word.lower() for word in list_of_words]
-        if not salient: #if we want to do things like find top k or find min etc.
-            list_of_words = [word for word in list_of_words if word not in STOP_WORDS]
+        list_of_words = [word for word in list_of_words if word not in STOP_WORDS]
         list_of_words = [word for word in list_of_words \
                          if not word.startswith(STOP_PREFIXES)]
         return list_of_words
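pre_processing no longer needs the salient flag because the tf-idf path is gone, so stop words are always dropped. A standalone mirror of the cleaning pipeline, with assumed values for the module constants (PUNCTUATION, STOP_WORDS, and STOP_PREFIXES live elsewhere in the real file and may differ):

```python
# Assumed stand-ins for the module-level constants.
PUNCTUATION = '.,!?;:"()'
STOP_WORDS = {'the', 'a', 'an', 'to', 'and'}
STOP_PREFIXES = ('http', 'www')

def pre_processing(wh_page, caps):
    # Split, strip punctuation, optionally lowercase, drop stop words/prefixes.
    words = [w.strip(PUNCTUATION) for w in wh_page.split()
             if w.strip(PUNCTUATION) != '']
    if not caps:
        words = [w.lower() for w in words]
    words = [w for w in words if w not in STOP_WORDS]
    return [w for w in words if not w.startswith(STOP_PREFIXES)]

print(pre_processing("Boil the water, then add pasta.", caps=False))
# -> ['boil', 'water', 'then', 'add', 'pasta']
```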
     def n_gram_salient(self, wh_page, caps, n):
         n_gram_lst = []
-        list_of_words = pre_processing(wh_page,caps)
+        list_of_words = self.pre_processing(wh_page,caps)
         start_value = 0
         stop_value = n
         while stop_value <= len(list_of_words):
@@ -452,63 +439,16 @@ class Article:
             stop_value +=1
         return n_gram_lst
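The fold between hunks hides the loop body, but the counters make the shape clear: a width-n window slides across the token list one position at a time. A standalone sketch of the same idea (the tuple append is an assumption, though the w[0][0] indexing in find_common_words below implies the list holds tuples):

```python
def n_grams(tokens, n):
    # Slide a width-n window over the tokens, emitting one tuple per position.
    grams = []
    start, stop = 0, n
    while stop <= len(tokens):
        grams.append(tuple(tokens[start:stop]))
        start += 1
        stop += 1
    return grams

print(n_grams(['boil', 'water', 'add', 'pasta'], 2))
# -> [('boil', 'water'), ('water', 'add'), ('add', 'pasta')]
```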
+    def find_common_words(self, string):
+        words_lst = Counter()
+        string_lst = self.n_gram_salient(string, False, 1)
+        words_lst.update(w for w in string_lst)
+        common_words = [w for w in words_lst.most_common(15)]
+        common_words = [w[0][0] for w in common_words]
+        return common_words
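This is what the new Counter import is for: update tallies the unigram tuples, most_common(15) returns (item, count) pairs, and w[0][0] unwraps the pair, then the 1-tuple, down to the bare word. A minimal standalone illustration:

```python
from collections import Counter

# n_gram_salient with n=1 yields one-word tuples such as ('water',).
unigrams = [('water',), ('boil',), ('water',), ('pasta',), ('water',)]

counts = Counter()
counts.update(unigrams)
top = counts.most_common(2)    # -> [(('water',), 3), (('boil',), 1)]
tags = [w[0][0] for w in top]  # (item, count) pair -> 1-tuple -> word
print(tags)                    # -> ['water', 'boil']
```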
-    def calc_tf(self, a_token, tokens):
-        """
-        calculates term frequency
-        """
-        token_list = count_tokens(tokens)
-        max_term = max(token_list.values())
-        f_term = token_list[a_token]
-        tf = 0.5+0.5*(f_term/max_term)
-        return tf
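The deleted helper computed augmented term frequency, tf = 0.5 + 0.5 * (f_t / max_f), which pins the document's most frequent token at 1.0 and everything else in (0.5, 1.0]. A Counter-based equivalent of calc_tf plus its count_tokens companion, shown standalone:

```python
from collections import Counter

def calc_tf(a_token, tokens):
    # Augmented term frequency: the most frequent token scores exactly 1.0.
    counts = Counter(tokens)
    return 0.5 + 0.5 * (counts[a_token] / max(counts.values()))

print(calc_tf('boil', ['water', 'water', 'boil']))  # -> 0.5 + 0.5*(1/2) = 0.75
```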
-    def count_tokens(self, tokens):
-        token_dict = {}
-        for token in tokens:
-            if token not in token_dict:
-                token_dict[token] = 1
-            else:
-                token_dict[token] += 1
-        return token_dict
-    def calc_idf(self, docs,a_token):
-        """
-        calculates inverse document frequency
-        """
-        docs_with_t = 0
-        N = len(docs)
-        for lst in docs:
-            if a_token in lst:
-                docs_with_t +=1
-        idf = math.log(N/docs_with_t)
-        return idf
-    def find_salient(self, docs, threshold):
-        """
-        Compute the salient words for each document. A word is salient if
-        its tf-idf score is strictly above a given threshold.
-
-        Inputs:
-            docs: list of list of tokens
-            threshold: float
-
-        Returns: list of sets of salient words
-        """
-        salient_lst = []
-        for tok_lst in docs:
-            new_set = set()
-            for a_token in tok_lst:
-                tf_idf = calc_tf(a_token,tok_lst) * calc_idf(docs, a_token)
-                if tf_idf > threshold:
-                    new_set.add(a_token)
-            salient_lst.append(new_set)
-        return salient_lst
-    '''
     def __repr__(self):
         info = self.title
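For reference, the tf-idf path deleted above scored each token as tf * idf with idf = ln(N / n_t) over the document collection, keeping tokens strictly above the threshold. A compact standalone version of that logic (using math.log as the old code did, and the augmented tf from calc_tf):

```python
import math
from collections import Counter

def find_salient(docs, threshold):
    # docs: list of token lists; returns one set of salient tokens per doc.
    salient = []
    for tokens in docs:
        counts = Counter(tokens)
        max_f = max(counts.values())
        keep = set()
        for tok in counts:
            tf = 0.5 + 0.5 * (counts[tok] / max_f)
            idf = math.log(len(docs) / sum(tok in d for d in docs))
            if tf * idf > threshold:
                keep.add(tok)
        salient.append(keep)
    return salient

docs = [['water', 'water', 'boil'], ['pasta', 'water']]
print(find_salient(docs, 0.3))  # -> [{'boil'}, {'pasta'}]
# 'water' appears in every doc, so idf = ln(2/2) = 0 and it is never salient.
```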