Refactoring.
This commit is contained in:
parent
d5528bfb52
commit
9c4ba39558
|
@ -0,0 +1,80 @@
|
|||
#!/usr/bin/env python3
|
||||
#
|
||||
# Copyright (C) 2016,2017 The University of Sheffield, UK
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
|
||||
import os
|
||||
import sys
|
||||
import glob
|
||||
import re
|
||||
import requests
|
||||
from time import sleep
|
||||
from random import randint
|
||||
from datetime import datetime, timezone
|
||||
from ExtensionCrawler.util import *
|
||||
import dateutil
|
||||
import dateutil.parser
|
||||
|
||||
|
||||
def get_local_archive_dir(id):
    """Return the sharded local archive directory for extension *id*.

    The first three characters of the id form a prefix directory, e.g.
    ``"abcdefgh"`` -> ``"abc/abcdefgh"``.
    """
    return f"{id[:3]}/{id}"
|
||||
|
||||
def get_local_archive_dirs(id):
    """Return all candidate local archive directories for *id*.

    Currently there is exactly one sharded directory per extension.
    """
    single_dir = get_local_archive_dir(id)
    return [single_dir]
|
||||
|
||||
def write_text(dir, fname, text):
    """Write *text* to the file named *fname* inside directory *dir*."""
    target = os.path.join(dir, fname)
    with open(target, 'w') as out:
        out.write(text)
|
||||
|
||||
|
||||
def store_request_metadata(dir, fname, request):
    """Persist metadata of an HTTP response next to its payload file.

    Writes ``<fname>.headers``, ``<fname>.status`` and ``<fname>.url``
    in *dir*, each containing the stringified attribute of *request*.
    """
    for suffix, value in ((".headers", request.headers),
                          (".status", request.status_code),
                          (".url", request.url)):
        write_text(dir, fname + suffix, str(value))
|
||||
|
||||
|
||||
def store_request_text(dir, fname, request):
    """Store the body of an HTTP response under *fname*, then its metadata."""
    # Body first, metadata files second (same order as the rest of the module).
    write_text(dir, fname, request.text)
    store_request_metadata(dir, fname, request)
|
||||
|
||||
def httpdate(dt):
    """Format datetime *dt* as an HTTP date string, e.g.
    ``"Mon, 02 Jan 2017 03:04:05 GMT"``.

    Month/weekday names are looked up locally so the result does not
    depend on the process locale.
    """
    days = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
    months = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
              "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
    return "{}, {:02d} {} {:04d} {:02d}:{:02d}:{:02d} GMT".format(
        days[dt.weekday()], dt.day, months[dt.month - 1],
        dt.year, dt.hour, dt.minute, dt.second)
|
||||
|
||||
|
||||
def last_modified_utc_date(path):
    """Extract the date component from an archive file path.

    The date is the name of the file's parent directory, i.e.
    ``.../<date>/<file>`` yields ``<date>``.

    Args:
        path: archive file path, or "" when no archive exists yet.
    Returns:
        The parent directory name, or "" for an empty/missing path.
    """
    # Bug fix: the original used `path is ""`, an identity comparison that
    # is not guaranteed to be true for equal strings (and raises a
    # SyntaxWarning on modern CPython). Use a truthiness check instead.
    if not path:
        return ""
    return os.path.split(os.path.dirname(path))[1]
|
||||
|
||||
|
||||
def last_modified_http_date(path):
    """Return the last-modified date encoded in *path* as an HTTP date.

    Args:
        path: archive file path whose parent directory name is a parseable
            date string, or "" when no archive exists yet.
    Returns:
        HTTP date string (see ``httpdate``), or "" for an empty path.
    """
    # Bug fix: the original used `path is ""` (identity, not equality);
    # equal strings are not guaranteed to be the same object.
    if not path:
        return ""
    return httpdate(dateutil.parser.parse(last_modified_utc_date(path)))
|
||||
def last_crx(dir, extid):
    """Return the newest archived ``.crx`` file found in sibling
    directories of *dir* (lexicographically last match), or "" when no
    archive exists yet.

    Note: *extid* is not used by the lookup; it is kept for interface
    compatibility with the callers.
    """
    pattern = os.path.join(os.path.dirname(dir), "*/*.crx")
    candidates = sorted(glob.glob(pattern))
    return candidates[-1] if candidates else ""
|
||||
|
|
@ -0,0 +1,76 @@
|
|||
#!/usr/bin/env python3
|
||||
#
|
||||
# Copyright (C) 2016,2017 The University of Sheffield, UK
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
|
||||
|
||||
def const_sitemap_url():
    """URL of the Chrome Web Store sitemap."""
    url = "https://chrome.google.com/webstore/sitemap"
    return url
|
||||
|
||||
|
||||
def const_sitemap_scheme():
    """XML namespace of the sitemaps.org 0.9 sitemap schema."""
    scheme = "http://www.sitemaps.org/schemas/sitemap/0.9"
    return scheme
|
||||
|
||||
|
||||
def const_overview_url(id):
    """Overview (detail) page URL for extension *id*."""
    return f'https://chrome.google.com/webstore/detail/{id}'
|
||||
|
||||
|
||||
def const_store_url():
    """Base URL of the Chrome Web Store."""
    base = 'https://chrome.google.com/webstore'
    return base
|
||||
|
||||
|
||||
def const_review_url():
    """Endpoint used to fetch extension reviews."""
    endpoint = 'https://chrome.google.com/reviews/components'
    return endpoint
|
||||
|
||||
|
||||
def const_support_url():
    """Endpoint used to fetch extension support threads.

    NOTE(review): identical to the reviews endpoint — the two feeds are
    distinguished by the request payload, not the URL (presumably
    intentional; confirm against the payload builders).
    """
    endpoint = 'https://chrome.google.com/reviews/components'
    return endpoint
|
||||
|
||||
|
||||
def const_download_url():
    """CRX download URL template; callers ``.format`` the extension id
    into the single ``{}`` placeholder."""
    template = 'https://clients2.google.com/service/update2/crx?response=redirect&nacl_arch=x86-64&prodversion=9999.0.9999.0&x=id%3D{}%26uc'
    return template
|
||||
|
||||
|
||||
def const_categories():
    """Browsable category slugs of the Chrome Web Store (crawl roots)."""
    return [
        'extensions',
        'ext/22-accessibility',
        'ext/10-blogging',
        'ext/15-by-google',
        'ext/11-web-development',
        'ext/14-fun',
        'ext/6-news',
        'ext/28-photos',
        'ext/7-productivity',
        'ext/38-search-tools',
        'ext/12-shopping',
        'ext/1-communication',
        'ext/13-sports',
    ]
|
||||
|
||||
|
||||
def const_support_payload(ext_id, start, end):
    """Build the request payload for fetching support threads of *ext_id*.

    *start* and *end* select the result window (startindex / numresults).
    ``{{``/``}}`` are literal braces escaped for ``str.format``.
    """
    template = (
        'req={{ "appId":94,'
        '"version":"150922",'
        '"hl":"en",'
        '"specs":[{{"type":"CommentThread",'
        '"url":"http%3A%2F%2Fchrome.google.com%2Fextensions%2Fpermalink%3Fid%3D{}",'
        '"groups":"chrome_webstore_support",'
        '"startindex":"{}",'
        '"numresults":"{}",'
        '"id":"379"}}],'
        '"internedKeys":[],'
        '"internedValues":[]}}')
    return template.format(ext_id, start, end)
|
||||
|
||||
|
||||
def const_review_payload(ext_id, start, end):
    """Build the request payload for fetching reviews of *ext_id*.

    *start* and *end* select the result window (startindex / numresults);
    results are ordered by the ``cws_qscore`` sort key.
    ``{{``/``}}`` are literal braces escaped for ``str.format``.
    """
    template = (
        'req={{ "appId":94,'
        '"version":"150922",'
        '"hl":"en",'
        '"specs":[{{"type":"CommentThread",'
        '"url":"http%3A%2F%2Fchrome.google.com%2Fextensions%2Fpermalink%3Fid%3D{}",'
        '"groups":"chrome_webstore",'
        '"sortby":"cws_qscore",'
        '"startindex":"{}",'
        '"numresults":"{}",'
        '"id":"428"}}],'
        '"internedKeys":[],'
        '"internedValues":[]}}')
    return template.format(ext_id, start, end)
|
53
crawler
53
crawler
|
@ -24,8 +24,9 @@ import requests
|
|||
from time import sleep
|
||||
from random import randint
|
||||
from datetime import datetime, timezone
|
||||
import ExtensionCrawler.discover
|
||||
import ExtensionCrawler.archive
|
||||
from ExtensionCrawler.discover import *
|
||||
from ExtensionCrawler.archive import *
|
||||
from ExtensionCrawler.util import *
|
||||
import dateutil
|
||||
import dateutil.parser
|
||||
|
||||
|
@ -114,23 +115,6 @@ class UpdateResult:
|
|||
def google_dos_protection(max=3):
    """Sleep a random interval (0.5 .. max/2 seconds, half-second steps)
    to avoid hammering Google's servers with back-to-back requests."""
    delay = 0.5 * randint(1, max)
    sleep(delay)
|
||||
|
||||
|
||||
def write_text(dir, fname, text):
    """Write *text* to the file named *fname* inside directory *dir*."""
    target = os.path.join(dir, fname)
    with open(target, 'w') as out:
        out.write(text)
|
||||
|
||||
|
||||
def store_request_metadata(dir, fname, request):
    """Persist metadata of an HTTP response next to its payload file.

    Writes ``<fname>.headers``, ``<fname>.status`` and ``<fname>.url``
    in *dir*, each containing the stringified attribute of *request*.
    """
    for suffix, value in ((".headers", request.headers),
                          (".status", request.status_code),
                          (".url", request.url)):
        write_text(dir, fname + suffix, str(value))
|
||||
|
||||
|
||||
def store_request_text(dir, fname, request):
    """Store the body of an HTTP response under *fname*, then its metadata."""
    # Body first, metadata files second (same order as the rest of the module).
    write_text(dir, fname, request.text)
    store_request_metadata(dir, fname, request)
|
||||
|
||||
|
||||
def log(verbose, msg):
    """Write *msg* to stdout when *verbose* logging is enabled; no-op otherwise."""
    if not verbose:
        return
    sys.stdout.write(msg)
|
||||
|
@ -150,37 +134,6 @@ def update_overview(dir, verbose, ext_id):
|
|||
return RequestResult(res)
|
||||
|
||||
|
||||
def httpdate(dt):
    """Format datetime *dt* as an HTTP date string, e.g.
    ``"Mon, 02 Jan 2017 03:04:05 GMT"``.

    Month/weekday names are looked up locally so the result does not
    depend on the process locale.
    """
    days = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
    months = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
              "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
    return "{}, {:02d} {} {:04d} {:02d}:{:02d}:{:02d} GMT".format(
        days[dt.weekday()], dt.day, months[dt.month - 1],
        dt.year, dt.hour, dt.minute, dt.second)
|
||||
|
||||
|
||||
def last_modified_utc_date(path):
    """Extract the date component from an archive file path.

    The date is the name of the file's parent directory, i.e.
    ``.../<date>/<file>`` yields ``<date>``.

    Args:
        path: archive file path, or "" when no archive exists yet.
    Returns:
        The parent directory name, or "" for an empty/missing path.
    """
    # Bug fix: the original used `path is ""`, an identity comparison that
    # is not guaranteed to be true for equal strings (and raises a
    # SyntaxWarning on modern CPython). Use a truthiness check instead.
    if not path:
        return ""
    return os.path.split(os.path.dirname(path))[1]
|
||||
|
||||
|
||||
def last_modified_http_date(path):
    """Return the last-modified date encoded in *path* as an HTTP date.

    Args:
        path: archive file path whose parent directory name is a parseable
            date string, or "" when no archive exists yet.
    Returns:
        HTTP date string (see ``httpdate``), or "" for an empty path.
    """
    # Bug fix: the original used `path is ""` (identity, not equality);
    # equal strings are not guaranteed to be the same object.
    if not path:
        return ""
    return httpdate(dateutil.parser.parse(last_modified_utc_date(path)))
|
||||
|
||||
|
||||
def last_crx(dir, extid):
    """Return the newest archived ``.crx`` file found in sibling
    directories of *dir* (lexicographically last match), or "" when no
    archive exists yet.

    Note: *extid* is not used by the lookup; it is kept for interface
    compatibility with the callers.
    """
    pattern = os.path.join(os.path.dirname(dir), "*/*.crx")
    candidates = sorted(glob.glob(pattern))
    return candidates[-1] if candidates else ""
|
||||
|
||||
|
||||
def validate_crx_response(res, extfilename):
|
||||
regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$')
|
||||
if not 'Content-Type' in res.headers:
|
||||
|
|
Loading…
Reference in New Issue