Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
Michelle Awh
/
project_kitty
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Pipelines
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
c4472f13
authored
Mar 12, 2021
by
Michelle Awh
Browse files
Options
Browse Files
Download
Plain Diff
a
parents
c72c2a34
7e06e29c
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
81 additions
and
0 deletions
SearchResults.py
SearchResults.py
View file @
c4472f13
...
...
@@ -94,6 +94,12 @@ class SearchResults:
def
get_search_soup
(
self
,
query
):
"""
Take a string of search terms and return a soup object
Input: query
Returns: soup object
"""
search
=
'+'
.
join
(
query
.
split
())
response
=
requests
.
get
(
'https://www.wikihow.com/wikiHowTo?search='
+
search
)
soup
=
BeautifulSoup
(
response
.
content
,
'html.parser'
)
...
...
@@ -101,6 +107,13 @@ class SearchResults:
def
passes_outer_filters
(
self
,
outer_article
):
"""
Using an outer_article object, check to make sure the
article fits the chosen filters
Input: outer_article object
Returns: Boolean representing if the article meets the given conditions
"""
if
'date_updated'
in
self
.
__filters
:
num
,
interval
=
outer_article
.
date_updated
.
split
()[:
2
]
num
=
int
(
num
)
...
...
@@ -125,6 +138,13 @@ class SearchResults:
def
get_required_words
(
self
,
query
):
"""
Given a string of words required for the results, split
and return as a list
Input: (str) query representing words to look for
Returns: List of search terms
"""
required_words
=
[]
match
=
re
.
findall
(
'".*"'
,
query
)
if
match
==
[]:
...
...
@@ -135,6 +155,13 @@ class SearchResults:
def
get_forbidden_words
(
self
,
query
):
"""
Given a string of words used to filter out results, split
and return as a list
Input: (str) query representing words to avoid in results
Returns: List of forbidden words
"""
forbidden_words
=
[]
match
=
re
.
search
(
'-.*$'
,
query
)
if
match
==
None
:
...
...
@@ -146,8 +173,18 @@ class SearchResults:
def
passes_all_filters
(
self
,
article
):
<<<<<<<
HEAD
if
'Main-Page'
in
article
.
url
:
return
False
=======
"""
Given an article object, check to make sure that the
article matches the conditions set by the filters
Input: article object
Returns: boolean representing if the conditions have been met
"""
>>>>>>>
7e06
e29c9767fc90bea8e3028de2e61851c4c33c
if
'child_safe'
in
self
.
__filters
:
if
self
.
__filters
[
'child_safe'
]:
self
.
__forbidden_words
+=
inappropriate_words_lst
...
...
@@ -197,6 +234,11 @@ class SearchResults:
def
__repr__
(
self
,
limit_results
=
None
):
"""
repr method for SearchResults class
Input: (int) limit_results max number of results to show
(will show all results if = None)
"""
info
=
'Results for '
+
self
.
__query
+
':
\n
'
if
limit_results
!=
None
:
results
=
self
.
articles
[:
limit_results
]
...
...
@@ -212,6 +254,11 @@ class SearchResults:
class
OuterArticle
:
def
__init__
(
self
,
result
):
"""
Class constructor for OuterArticle class
Input: result = a soup object representing a search directory
"""
self
.
title
=
result
.
find
(
'div'
,
class_
=
'result_title'
)
.
text
self
.
date_updated
=
' '
.
join
(
result
.
find
\
(
'li'
,
class_
=
'sr_updated'
)
.
text
.
split
()[
1
:])
...
...
@@ -229,6 +276,11 @@ class Article:
def
__init__
(
self
,
url
):
"""
Class constructor for the Article class
Input: url = url for a specific article
"""
self
.
__soup
=
self
.
get_page_soup
(
url
)
self
.
title
=
self
.
__soup
.
find
(
'h1'
)
.
text
self
.
sources
=
self
.
get_sources
(
self
.
__soup
)
...
...
@@ -249,11 +301,23 @@ class Article:
def get_page_soup(self, url, timeout=10):
    """
    Take a url and return a soup object
    Input: url of a specific article
           timeout = seconds to wait for the server before giving up
                     (passed straight through to requests.get)
    Returns: BeautifulSoup object built from the response body with
             the 'html.parser' parser
    Raises: requests.exceptions.Timeout if the server does not answer
            within the timeout
    """
    # Without a timeout, requests waits indefinitely on a stalled
    # connection, which would hang every caller that builds an Article.
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')
def
get_sources
(
self
,
soup
):
"""
Take a soup object of an article and return the cited sources
Input: wikihow article soup object
Returns: list of cited sources
"""
divs
=
soup
.
find
(
'div'
,
id
=
'references'
)
try
:
sources
=
[
s
[
'href'
]
for
s
in
\
...
...
@@ -282,6 +346,12 @@ class Article:
def
get_author
(
self
,
soup
):
"""
Take a soup object of an article and return the main author name
Input: wikihow article soup object
Returns: Name of main author
"""
div
=
soup
.
find
(
'a'
,
class_
=
"sp_namelink"
)
try
:
name
=
div
.
string
...
...
@@ -292,6 +362,14 @@ class Article:
def
get_sp_text_data
(
self
,
soup
):
"""
Take a soup object of an article and return a list containing
number of co-authors, date updated, and views
Input: wikihow article soup object
Returns: list containing number of co-authors, date updated,
and number of views
"""
lst
=
[]
span
=
soup
.
find_all
(
'span'
,
class_
=
'sp_text_data'
)
for
x
in
span
:
...
...
@@ -392,6 +470,9 @@ class Article:
def
all_peripheral_links
(
self
,
soup
,
url
):
"""
"""
related_links
=
self
.
get_related
(
self
.
__soup
,
url
)
expanding
,
full_expanse
=
self
.
expanding_breadcrumbs
(
related_links
,
soup
)
link_family
=
{
**
related_links
,
**
expanding
,
**
full_expanse
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment