Commit 68a3a764 by Michelle Awh

sultry gift for u chris

parent 1125f4b9
Showing with 0 additions and 94 deletions
......@@ -14,7 +14,6 @@ import re
import urllib.parse
import sys
import unicodedata
<<<<<<< HEAD
from collections import Counter
import nltk
nltk.download('stopwords')
......@@ -24,28 +23,6 @@ inappropriate_words_lst = inappropriate_words()
=======
def keep_chr(ch):
    '''
    Predicate used to build the PUNCTUATION constant.

    Inputs:
        ch: a single character

    Returns: True if ch is classified as punctuation in Unicode
        (category starting with 'P') and is not one of '#', '@', '&';
        False otherwise.
    '''
    # '#', '@', '&' are meaningful in tweets/URLs, so keep them
    if ch in ("#", "@", "&"):
        return False
    return unicodedata.category(ch).startswith('P')
# All Unicode punctuation characters (except #, @, &), space-joined;
# used as the char set for str.strip() when cleaning tokens.
PUNCTUATION = " ".join([chr(i) for i in range(sys.maxunicode)
if keep_chr(chr(i))])
# Common words (and tokenization fragments like "t", "wh") dropped
# during word-frequency processing.
STOP_WORDS = ["a", "an", "the", "this", "that", "of", "for", "or",
"and", "on", "to", "be", "if", "we", "you", "in", "is",
"at", "it", "rt", "mt", "with", "t", "don", "wh"]
# Tokens starting with any of these prefixes (mentions, hashtags,
# links, HTML-escaped ampersands) are discarded entirely.
STOP_PREFIXES = ("@", "#", "http", "&amp")
>>>>>>> d2692ff014493cfae8c975b6a9b16d55f549c9fb
......@@ -87,7 +64,6 @@ class SearchResults:
Returns:
a SearchResults object
'''
<<<<<<< HEAD
self.__filters = filters
self.__required_words = []
self.__required_words += self.get_required_words(query)
......@@ -98,19 +74,6 @@ class SearchResults:
else:
self.__query = query
self.__soup = self.get_search_soup(self.__query)
=======
self.__query = query
self.__filters = filters
self.__required_words = self.get_required_words(query)
if self.__required_words != None:
self.__filters['required_words'] = self.__required_words
self.__forbidden_words = self.get_forbidden_words(query)
if self.__forbidden_words != None:
self.__filters['forbidden_words'] = self.__forbidden_words
self.__soup = self.get_search_soup(query)
>>>>>>> d2692ff014493cfae8c975b6a9b16d55f549c9fb
self.__links = [a['href'] for a in self.__soup.find_all(\
'a', class_='result_link')]
self.__results = self.__soup.find_all('a', class_ = 'result_link')
......@@ -121,17 +84,12 @@ class SearchResults:
self.__outer_articles = [r for r in self.__unfiltered_outer_articles\
if self.passes_outer_filters(r)]
self.__unfiltered_articles = [Article(r.url) for r in self.__outer_articles]
<<<<<<< HEAD
self.articles = []
for a in self.__unfiltered_articles:
if self.passes_all_filters(a):
self.articles.append(a)
for a in self.articles:
a.related_enough()
=======
self.articles = [a for a in self.__unfiltered_articles if\
self.passes_all_filters(a)]
>>>>>>> d2692ff014493cfae8c975b6a9b16d55f549c9fb
def get_search_soup(self, query):
......@@ -142,11 +100,6 @@ class SearchResults:
def passes_outer_filters(self, outer_article):
<<<<<<< HEAD
print(self.__filters)
print('checking')
=======
>>>>>>> d2692ff014493cfae8c975b6a9b16d55f549c9fb
if 'date_updated' in self.__filters:
num, interval = outer_article.date_updated.split()[:2]
num = int(num)
......@@ -207,16 +160,11 @@ class SearchResults:
if self.__filters['child_safe']:
self.__forbidden_words += inappropriate_words_lst
if 'pct_helpful' in self.__filters:
<<<<<<< HEAD
try:
if int(article.pct_helpful) < self.__filters['pct_helpful']:
return False
except TypeError:
pass
=======
if article.pct_helpful < self.__filters['pct_helpful']:
return False
>>>>>>> d2692ff014493cfae8c975b6a9b16d55f549c9fb
if 'num_voters' in self.__filters:
if int(article.num_voters) < self.__filters['num_voters']:
return False
......@@ -489,7 +437,6 @@ class Article:
links.append(filtered_link)
return links
<<<<<<< HEAD
......@@ -536,12 +483,9 @@ class Article:
self.related_articles += actual_related_articles
self.related_categories += actual_related_categories
=======
>>>>>>> d2692ff014493cfae8c975b6a9b16d55f549c9fb
def pre_processing(self, wh_page, caps):
<<<<<<< HEAD
tokenizer = nltk.RegexpTokenizer(r'\w+')
list_of_words = tokenizer.tokenize(wh_page)
list_of_words = [word for word in list_of_words if word not in stopwords.words('english')]
......@@ -549,33 +493,6 @@ class Article:
def n_gram(self, wh_page, caps, n):
=======
list_of_words = wh_page.split()
list_of_words = [word.strip(PUNCTUATION) for word in list_of_words \
if word.strip(PUNCTUATION) != '']
if not caps:
list_of_words = [word.lower() for word in list_of_words]
list_of_words = [word for word in list_of_words if word not in STOP_WORDS]
list_of_words = [word for word in list_of_words \
if not word.startswith(STOP_PREFIXES)]
return list_of_words
def find_common_words(self, string_lst):
    '''
    Find the most frequent single words in the given text.

    Inputs:
        string_lst: text to tokenize (forwarded to n_gram)

    Returns: list of up to 15 words, most frequent first
    '''
    # n_gram(..., n=1) produces 1-tuples such as ('word',);
    # Counter(iterable) replaces the manual update() loop, and the
    # leftover debug print() calls are removed.
    counts = Counter(self.n_gram(string_lst, False, 1))
    # unpack the 1-tuple key from each (key, count) pair
    return [gram[0] for gram, _count in counts.most_common(15)]
def n_gram(self, wh_page, caps, n):
>>>>>>> d2692ff014493cfae8c975b6a9b16d55f549c9fb
n_gram_lst = []
list_of_words = self.pre_processing(wh_page,caps)
start_value = 0
......@@ -585,7 +502,6 @@ class Article:
start_value +=1
stop_value +=1
return n_gram_lst
<<<<<<< HEAD
def find_common_words(self, string):
......@@ -595,8 +511,6 @@ class Article:
common_words = [w for w in words_lst.most_common(15)]
common_words = [w[0][0] for w in common_words]
return common_words
=======
>>>>>>> d2692ff014493cfae8c975b6a9b16d55f549c9fb
'''
def find_salient_ngrams(self, wh_page, n, case_sensitive, threshold):
......@@ -692,11 +606,3 @@ class Article:
else:
info += '\nNo Sources Cited'
return info
<<<<<<< HEAD
def test():
    # Debug helper: dump the module-level list loaded by
    # inappropriate_words() so it can be inspected manually.
    print(inappropriate_words_lst)
=======
>>>>>>> d2692ff014493cfae8c975b6a9b16d55f549c9fb
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment