2021-155 · Commits · 996a2155

Commit 996a2155, authored Jul 03, 2021 by dasunx

    Medium article and dev.to article scrappers added

Parent: 4a46704a

Showing 7 changed files with 234 additions and 89 deletions (+234 / -89)
backend/models/automatedAnswer.js              +31   -3
backend/models/blogArticle.js (deleted)         +0  -22
backend/python/auto-answer/Dev.py (new)        +75   -0
backend/python/auto-answer/Medium.py           +95  -54
backend/python/auto-answer/requirements.txt     +3   -1
backend/python/auto-answer/scrapper.py         +29   -9
backend/requirements.txt                        +1   -0
backend/models/automatedAnswer.js

@@ -29,10 +29,38 @@ const AutomatedAnswerSchema = mongoose.Schema({
             type: String
         }
     ],
-    blogs: [
-        { type: Schema.Types.ObjectId, ref: 'BlogArticle' }
-    ],
+    medium_articles: [
+        {
+            title: String,
+            pubDate: String,
+            link: String,
+            guid: String,
+            author: String,
+            thumbnail: String,
+            description: String,
+            content: String
+        }
+    ],
+    dev_articles: [
+        {
+            title: String,
+            pubDate: String,
+            link: String,
+            guid: String,
+            author: String,
+            thumbnail: String,
+            description: String,
+            content: String
+        }
+    ],
+    medium_resources: [
+        { type: String }
+    ],
+    dev_resources: [
+        { type: String }
+    ]
 });
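For reference, each entry in the new medium_articles and dev_articles arrays mirrors the item objects that the rss2json API returns to the Python scrapers below. A minimal sketch of one such item; the field names come from the schema above, but every value here is made up:

    # Illustrative only: one article as rss2json returns it and as the new
    # medium_articles / dev_articles subdocuments store it.
    sample_article = {
        "title": "Getting started with Django and MongoDB",      # hypothetical
        "pubDate": "2021-06-28 14:02:11",                         # hypothetical
        "link": "https://medium.com/@someuser/some-post-abc123",  # hypothetical
        "guid": "https://medium.com/p/abc123",                    # hypothetical
        "author": "someuser",                                     # hypothetical
        "thumbnail": "https://cdn-images-1.medium.com/thumb.png", # hypothetical
        "description": "A short excerpt of the post...",
        "content": "<p>Full HTML content of the post</p>",
    }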
backend/models/blogArticle.js  (deleted, file mode 100644 → 0)

const mongoose = require('mongoose');

const BlogArticleSchema = mongoose.Schema({
    automatedAnswer: {
        type: Schema.Types.ObjectId,
        ref: 'AutomatedAnswer',
        required: true
    },
    blogName: {
        type: String,
        required: true
    },
    link: {
        type: String,
        required: true
    },
    content: {
        type: String
    }
});

module.exports = mongoose.model('BlogArticle', BlogArticleSchema);
backend/python/auto-answer/Dev.py  (new file, 0 → 100644)

from search_engine_parser import GoogleSearch
import re
import requests
import random


class DevTo:
    def __init__(self, title, tags):
        self.title = title
        self.tags = tags

    def getApiKey(self):
        api_keys = [
            "2rk1eg4sexdnp5umrwtwbtwd2insqvgzvejooqrn",
            "yit6ytfcs3ziawdgasfd3bgkbf4tef1m2nzdxvnz",
            "mpawymyrc6derrwmgodowfsaabtuoes4iiwintd7",
        ]
        return random.choice(api_keys)

    def google(self, query):
        search_args = (query, 1)
        gsearch = GoogleSearch()
        gresults = gsearch.search(*search_args)
        return gresults["links"]

    def getValidUrls(self, links):
        validUrls = []
        for i in links:
            if "dev.to" in i:
                uriTrimmed = re.match(r"^.*?\&sa=", i[29:]).group(0)
                ur = uriTrimmed.replace("&sa=", "")
                validUrls.append(ur)
        return validUrls

    def getValidSets(self, validUrls):
        validSets = []
        for url in validUrls:
            try:
                vset = {}
                print(url)
                username = re.search(r"https://dev.to/([^/?]+)", url).group(1)
                tag = re.search(r"https://dev.to/([^/?]+)/([^/?]+)", url).group(2)
                vset["username"] = username
                vset["tag"] = tag
                validSets.append(vset)
            except Exception as e:
                print(e)
                continue
        return validSets

    def getBlogs(self, username, tag):
        blog = {}
        try:
            response = requests.get(
                f"https://api.rss2json.com/v1/api.json?rss_url=https%3A%2F%2Fdev.to%2Ffeed%2F{username}&api_key={self.getApiKey()}"
            )
            if response.status_code == 200:
                res = response.json()
                for item in res["items"]:
                    if tag in item["link"]:
                        blog = item
        except Exception as e:
            print(e)
        return blog

    def getDevArticles(self):
        links = self.google(f"site:dev.to {self.title} after:2020-01-01")
        validUrls = self.getValidUrls(links)
        validSets = self.getValidSets(validUrls)
        blogs = []
        for validset in validSets:
            blog = self.getBlogs(validset["username"], validset["tag"])
            if bool(blog):
                blogs.append(blog)
        return {"blogs": blogs, "resources": validUrls}
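A minimal usage sketch for the DevTo class above. The title and tags are hypothetical (in this commit they arrive via scrapper.py's argv), and getDevArticles performs live Google and rss2json requests, so the output varies:

    from Dev import DevTo

    # Hypothetical inputs; in the project these come from scrapper.py.
    devto = DevTo("python django or flask for web development", ["python"])
    result = devto.getDevArticles()  # {"blogs": [...], "resources": [...]}
    print(len(result["blogs"]), "matched articles,",
          len(result["resources"]), "dev.to links")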
backend/python/auto-answer/Medium.py

 import requests
 from requests_html import HTMLSession
 from bs4 import BeautifulSoup
 import json
 from search_engine_parser import GoogleSearch
 import re
 from lxml import etree
 import requests
 import random


 class Medium:
-    def __init__(self, qtitle, keywords=[], description=""):
-        self.qtitle = qtitle
-        self.keywords = keywords
-        self.description = description
-        self.urls = []
-        self.session = HTMLSession()
-
-    def searchArticles(self):
-        """
-        Search details using google dorks.
-        With google dorks we can filter out search results from other web sites.
-        """
-        html_page = requests.get(f"https://google.com/search?q=site%3Amedium.com+{self.qtitle}")
-        soup = BeautifulSoup(html_page.content, "html.parser")
-        for link in soup.findAll("a"):
-            if "https://medium.com" in link["href"]:
-                self.urls.append(self.extractMediumURLS(link["href"]))
-        self.viewArticle(self.urls[0])
-
-    def extractMediumURLS(self, uriString):
-        """
-        Remove unwanted characters from the url string and filter out the targeted url
-        """
-        uriTrimmed = uriString[7:]
-        uriTrimmed = re.match(r"^.*?\&sa=", uriTrimmed).group(0)
-        return uriTrimmed.replace("&sa=", "")
-
-    def viewArticle(self, url):
-        html_page = self.session.get(url)
-        html_page.html.render(timeout=20)
-        # soup = BeautifulSoup(html_page.content, "html.parser")
-        # dom = etree.HTML(str(soup))
-        with open("medium.html", "wb") as med:
-            med.write(html_page.content)
-            med.close()
-        with open("medium.html", encoding="utf8") as sf:
-            soup = BeautifulSoup(sf, "html.parser")
-        dom = etree.HTML(str(soup))
-        # art = dom.xpath('//*[@class="a b c"]')[0]
-        # print(etree.tostring(art))
-        title = dom.xpath('//*[@class="ap aq ar as at ff av w"]/div/h1')[0].text
-        article = dom.xpath('//*[@class="ap aq ar as at ff av w"]')[0]
-        with open(f"article-{title.replace(' ','')}.html", "wb") as artFile:
-            artFile.write(etree.tostring(article))
-            artFile.close()
+    def __init__(self, title, tags):
+        self.title = title
+        self.tags = tags
+
+    def getApiKey(self):
+        """
+        Returns an API key for retrieving json data
+        """
+        api_keys = [
+            "2rk1eg4sexdnp5umrwtwbtwd2insqvgzvejooqrn",
+            "yit6ytfcs3ziawdgasfd3bgkbf4tef1m2nzdxvnz",
+            "mpawymyrc6derrwmgodowfsaabtuoes4iiwintd7",
+        ]
+        return random.choice(api_keys)
+
+    def google(self, query):
+        """
+        Use a query to search using the google search engine
+        """
+        search_args = (query, 1)
+        gsearch = GoogleSearch()
+        gresults = gsearch.search(*search_args)
+        return gresults["links"]
+
+    def getValidUrls(self, links):
+        """
+        Validate and filter out the urls.
+        Returns the urls that contain medium.com in them as a list
+        """
+        validUrls = []
+        for i in links:
+            if "medium.com" in i:
+                uriTrimmed = re.match(r"^.*?\&sa=", i[29:]).group(0)
+                ur = uriTrimmed.replace("&sa=", "")
+                validUrls.append(ur)
+        return validUrls
+
+    def getValidSets(self, validUrls):
+        """
+        Extract usernames and article id's from article urls.
+        Pass a list of urls => returns a list of objects containing username and article id
+        """
+        validSets = []
+        for url in validUrls:
+            try:
+                vset = {}
+                print(url)
+                username = re.search(r"https://medium.com/([^/?]+)", url).group(1)
+                tag = re.search(r"https://medium.com/([^/?]+)/([^/?]+)", url).group(2)
+                vset["username"] = username
+                vset["tag"] = tag
+                validSets.append(vset)
+            except Exception as e:
+                print(e)
+                continue
+        return validSets
+
+    def getBlogs(self, username, tag):
+        """
+        Get the content of the article
+        """
+        blog = {}
+        try:
+            response = requests.get(
+                f"https://api.rss2json.com/v1/api.json?rss_url=https%3A%2F%2Fmedium.com%2Ffeed%2F{username}&api_key={self.getApiKey()}"
+            )
+            if response.status_code == 200:
+                res = response.json()
+                for item in res["items"]:
+                    if tag in item["link"]:
+                        blog = item
+        except Exception as e:
+            print(e)
+        return blog
+
+    def getMediumArticles(self):
+        """
+        Return a list of articles and/or resources
+        """
+        links = self.google(f"site:medium.com {self.title} after:2020-01-01")
+        validUrls = self.getValidUrls(links)
+        validSets = self.getValidSets(validUrls)
+        blogs = []
+        for validset in validSets:
+            blog = self.getBlogs(validset["username"], validset["tag"])
+            if bool(blog):
+                blogs.append(blog)
+        with open("ff.json", "w") as f:
+            json.dump({"blogs": blogs, "resources": validUrls}, f)
+            f.close()
+        return {"blogs": blogs, "resources": validUrls}
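Both getValidUrls implementations lean on the search library returning Google redirect links, which makes the magic numbers clearer on a sample. Assuming a link of the redirect form below (the article URL itself is made up): len("https://www.google.com/url?q=") is 29, which is exactly what the i[29:] slice strips, and the non-greedy regex keeps everything up to the "&sa=" tracking parameter:

    import re

    # Hypothetical redirect-style link, as getValidUrls above assumes.
    link = "https://www.google.com/url?q=https://medium.com/@someuser/post-abc123&sa=U&ved=xyz"

    trimmed = link[29:]                               # drop "https://www.google.com/url?q="
    kept = re.match(r"^.*?\&sa=", trimmed).group(0)   # keep up to and including "&sa="
    print(kept.replace("&sa=", ""))                   # https://medium.com/@someuser/post-abc123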
backend/python/auto-answer/requirements.txt

+bson==0.5.10
 beautifulsoup4==4.9.3
 dnspython==2.1.0
 lxml==4.6.1
@@ -6,4 +7,5 @@ regex==2020.7.14
 requests==2.24.0
 requests-html==0.10.0
 scipy==1.5.4
+search-engine-parser==0.6.2
 youtube-search-python==1.4.6
backend/python/auto-answer/scrapper.py

 from youtube import Youtube
 from Medium import Medium
 from Dev import DevTo
 from stof import STOF
 import sys
 from database import get_database


-def saveAnswer(ans_id, stackoverflow, videos):
+def saveAnswer(ans_id, stackoverflow, videos, medium_r, dev_r):
     db = get_database()
     try:
         from bson.objectid import ObjectId
@@ -13,7 +14,26 @@ def saveAnswer(ans_id, stackoverflow, videos):
         automatedanswers = db["automatedanswers"]
         automatedanswers.update_one(
             {"_id": ObjectId(ans_id)},
-            {"$set": {"youtube": videos, "stackoverflow": stackoverflow}},
+            {
+                "$set": {
+                    "youtube": videos,
+                    "stackoverflow": stackoverflow,
+                    "medium_articles": medium_r["blogs"],
+                    "dev_articles": dev_r["blogs"],
+                    "medium_resources": medium_r["resources"],
+                    "dev_resources": dev_r["resources"],
+                }
+            },
         )
+        print(
+            {
+                "youtube": videos,
+                "stackoverflow": stackoverflow,
+                "medium_articles": medium_r["blogs"],
+                "dev_articles": dev_r["blogs"],
+                "medium_resources": medium_r["resources"],
+                "dev_resources": dev_r["resources"],
+            }
+        )
     except NameError as err:
         print(err)
@@ -23,19 +43,19 @@ if __name__ == "__main__":
     # title = input("Enter question title: ")
     title = sys.argv[1]  # "python django or flask for web development"
     tags = sys.argv[2]  # ["react"]
-    AUTO_ANS_ID = sys.argv[3]  # "60d746076689344694ad9e30"
+    AUTO_ANS_ID = sys.argv[3]  # "60dc9a5f84692f001569d7ab"
     stack = STOF(title)
     ans = stack.searchQuestion()
     print(ans)
-    # medium = Medium(title)
-    # medium.searchArticles()
-    # f = open("data.txt", "a")
-    # f.write(f"updated {title} {tags} {AUTO_ANS_ID}\n")
-    # f.close()
+    medium = Medium(title, tags)
+    medium_articels = medium.getMediumArticles()
+    devto = DevTo(title, tags)
+    dev_articles = devto.getDevArticles()
     youtube = Youtube(title, tags)
     videos = youtube.find_videos()
-    saveAnswer(AUTO_ANS_ID, ans, videos)
+    saveAnswer(AUTO_ANS_ID, ans, videos, medium_articels, dev_articles)
     print("WORKED")
     sys.stdout.flush()
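Since scrapper.py reads its three inputs from sys.argv, a caller (presumably the Node backend; the spawning side is not part of this commit) would invoke it along these lines. A hedged sketch, reusing the illustrative values from the script's own inline comments:

    import subprocess

    # Hypothetical invocation; argument values mirror the comments above.
    subprocess.run([
        "python", "scrapper.py",
        "python django or flask for web development",  # title       -> sys.argv[1]
        "react",                                       # tags        -> sys.argv[2]
        "60dc9a5f84692f001569d7ab",                    # AUTO_ANS_ID -> sys.argv[3]
    ])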
backend/requirements.txt

@@ -7,4 +7,5 @@ regex==2020.7.14
 requests==2.24.0
 requests-html==0.10.0
 scipy==1.5.4
+search-engine-parser==0.6.2
 youtube-search-python==1.4.6