Skip to content
Toggle navigation
P
Projects
G
Groups
S
Snippets
Help
Michelle Awh
/
project_kitty
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Pipelines
Members
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit
1e33abd4
authored
Mar 11, 2021
by
Michelle Awh
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
GOT ALL THE RELATED ARTICLE STUFF WORKING YEEEEEEEEEE
parent
41539005
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
59 additions
and
11 deletions
SearchResults.py
__pycache__/SearchResults.cpython-38.pyc
__pycache__/WikiHowHow.cpython-38.pyc
SearchResults.py
View file @
1e33abd4
...
...
@@ -225,13 +225,16 @@ class Article:
self
.
author
,
self
.
author_bio
=
self
.
get_author
(
self
.
__soup
)
self
.
num_coauthors
,
self
.
date_updated
,
self
.
num_views
=
\
self
.
get_sp_text_data
(
self
.
__soup
)
self
.
url
=
None
self
.
url
=
url
self
.
pct_helpful
,
self
.
num_voters
=
self
.
get_helpful
(
self
.
__soup
)
self
.
rating
=
self
.
find_rating
(
self
.
__soup
)
self
.
languages
=
self
.
find_languages
(
self
.
__soup
)
self
.
view
=
self
.
__repr__
()
self
.
words
=
self
.
scrape_words
(
self
.
__soup
)
self
.
tags
=
self
.
find_common_words
(
self
.
words
)
self
.
link_family
=
self
.
all_peripheral_links
(
self
.
__soup
,
self
.
url
)
print
(
'making related titles'
)
self
.
related_articles
,
self
.
related_categories
=
self
.
related_enough
()
def
get_page_soup
(
self
,
url
):
response
=
requests
.
get
(
url
)
...
...
@@ -338,7 +341,6 @@ class Article:
main_text
.
append
(
match
.
group
(
1
))
top
=
title
+
description
top
=
re
.
findall
(
"[^
\n
][a-zA-Z']+"
,
top
)
print
(
"AAA"
)
text
=
[
y
.
strip
()
for
y
in
top
]
for
t
in
main_text
:
text
+=
t
.
split
()
...
...
@@ -346,11 +348,13 @@ class Article:
def all_peripheral_links(self, soup, url):
    """Gather every peripheral (related / breadcrumb) link for this article.

    Merges the link groups returned by get_related and
    expanding_breadcrumbs into one flat list of URLs.

    NOTE(review): get_related is fed self.__soup while
    expanding_breadcrumbs receives the `soup` parameter — presumably the
    same parsed page; confirm against the caller.
    """
    related_links = self.get_related(self.__soup, url)
    expanding, full_expanse = self.expanding_breadcrumbs(related_links, soup)
    # Merge the three group dicts; later sources win on key collisions,
    # exactly as {**related_links, **expanding, **full_expanse} did.
    link_family = {**related_links, **expanding, **full_expanse}
    # Flatten every group's link list into a single list.
    return [link for group in link_family.values() for link in group]
def
get_related
(
self
,
soup
,
starting_url
):
...
...
@@ -367,14 +371,14 @@ class Article:
for
id_
in
ids
:
if
soup
.
find
(
id
=
'{}'
.
format
(
id_
))
!=
None
:
links
=
soup
.
find
(
id
=
'{}'
.
format
(
id_
))
.
find_all
(
'a'
)
related
[
id_
]
=
linked_urls
(
links
,
starting_url
)
related
[
id_
]
=
self
.
linked_urls
(
links
,
starting_url
)
for
class_
in
classes
:
box_a_lst
=
[]
peripherals
=
soup
.
find_all
(
'div'
,
class_
=
'{}'
.
format
(
class_
))
for
box
in
peripherals
:
if
box
.
a
!=
None
:
box_a_lst
+=
box
.
find_all
(
'a'
)
related
[
class_
]
=
linked_urls
(
box_a_lst
,
starting_url
)
related
[
class_
]
=
self
.
linked_urls
(
box_a_lst
,
starting_url
)
return
related
...
...
@@ -392,12 +396,12 @@ class Article:
for
relative
in
relatives
:
if
relative
.
a
!=
None
:
full_breadcrumbs
+=
relative
.
find_all
(
'a'
)
expanded
[
listing
]
=
linked_urls
(
full_breadcrumbs
,
listing
)
expanded
[
listing
]
=
self
.
linked_urls
(
full_breadcrumbs
,
listing
)
distant_relatives
=
soup
.
find_all
(
'div'
,
class_
=
"subcat_container"
)
for
category
in
distant_relatives
:
if
category
.
a
!=
None
:
loose_breadcrumbs
+=
category
.
find_all
(
'a'
)
full_expanse
[
listing
]
=
linked_urls
(
loose_breadcrumbs
,
listing
)
full_expanse
[
listing
]
=
self
.
linked_urls
(
loose_breadcrumbs
,
listing
)
return
expanded
,
full_expanse
...
...
@@ -415,6 +419,50 @@ class Article:
return
links
def extract_titles(self, lst):
    """Turn a list of wikiHow URLs into human-readable titles.

    Splits the input into article URLs and category URLs, converting each
    hyphen-separated slug into a space-separated title.

    Args:
        lst: iterable of URL strings.

    Returns:
        (titles, categories): two dicts mapping title -> url, for plain
        articles and for ``Category:`` pages respectively.
    """
    titles = {}
    categories = {}
    for url in lst:
        # Category listing pages are navigation hubs, not content; skip.
        if 'Special:CategoryListing' in url:
            continue
        if 'Category:' in url:
            match = re.search(r'Category:(.*)', url)
            target = categories
        else:
            # Article URLs look like https://www.wikihow.com/Some-Title.
            # The dot is escaped (the original '.com/' pattern let the
            # regex dot match any character before 'com/').
            match = re.search(r'\.com/(.*)', url)
            target = titles
        if match is None:
            # URL fits neither pattern — mirror the original behavior of
            # swallowing the AttributeError and moving on.
            continue
        # wikiHow slugs separate words with hyphens.
        title = ' '.join(match.group(1).split('-'))
        target[title] = url
    return (titles, categories)
def related_enough(self):
    """Filter this article's peripheral links down to the related ones.

    A link counts as related when its title contains one of the article's
    tags (first letter capitalized, since wikiHow titles are title-cased).

    Returns:
        (related_articles, related_categories): two dicts mapping
        title -> url, filtered from extract_titles' output.
    """
    links = self.all_peripheral_links(self.__soup, self.url)
    articles, categories = self.extract_titles(links)

    def _tag_matches(title_to_url):
        # Shared filter for both the article and category dicts (the
        # original duplicated this loop verbatim).
        matched = {}
        for title, url in title_to_url.items():
            for tag in self.tags:
                # Capitalize only the first letter; str.capitalize would
                # also lowercase the rest, which is not what we want.
                capitalized = tag[0].upper() + tag[1:]
                if capitalized in title:
                    matched[title] = url
                    # The original ended with `continue`, a no-op at the
                    # end of the loop body; one matching tag is enough.
                    break
        return matched

    #haha get it
    return (_tag_matches(articles), _tag_matches(categories))
def
pre_processing
(
self
,
wh_page
,
caps
):
list_of_words
=
wh_page
.
split
()
...
...
__pycache__/SearchResults.cpython-38.pyc
View file @
1e33abd4
No preview for this file type
__pycache__/WikiHowHow.cpython-38.pyc
View file @
1e33abd4
No preview for this file type
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment