Refactoring.

This commit is contained in:
Achim D. Brucker 2017-01-28 12:52:18 +00:00
parent d5528bfb52
commit 9c4ba39558
3 changed files with 159 additions and 50 deletions

ExtensionCrawler/archive.py (new executable file, +80)

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import os
import sys
import glob
import re
import requests
from time import sleep
from random import randint
from datetime import datetime, timezone
from ExtensionCrawler.util import *
import dateutil
import dateutil.parser

def get_local_archive_dir(id):
    return "{}/{}".format(id[:3], id)

def get_local_archive_dirs(id):
    return [get_local_archive_dir(id)]

def write_text(dir, fname, text):
    with open(os.path.join(dir, fname), 'w') as f:
        f.write(text)

def store_request_metadata(dir, fname, request):
    write_text(dir, fname + ".headers", str(request.headers))
    write_text(dir, fname + ".status", str(request.status_code))
    write_text(dir, fname + ".url", str(request.url))

def store_request_text(dir, fname, request):
    write_text(dir, fname, request.text)
    store_request_metadata(dir, fname, request)

def httpdate(dt):
    weekday = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()]
    month = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
        "Nov", "Dec"
    ][dt.month - 1]
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        weekday, dt.day, month, dt.year, dt.hour, dt.minute, dt.second)

def last_modified_utc_date(path):
    # Compare with == rather than "is": identity comparison of strings
    # relies on interning and raises a SyntaxWarning on newer Pythons.
    if path == "":
        return ""
    return os.path.split(os.path.dirname(path))[1]

def last_modified_http_date(path):
    if path == "":
        return ""
    return httpdate(dateutil.parser.parse(last_modified_utc_date(path)))

def last_crx(dir, extid):
    old_archives = sorted(
        glob.glob(os.path.join(os.path.dirname(dir), "*/*.crx")))
    last_archive = ""
    if old_archives != []:
        last_archive = old_archives[-1]
    return last_archive
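
Taken together, these helpers shard the archive by the first three characters of an extension id and recover a download's timestamp from its directory name. A minimal usage sketch, assuming "from ExtensionCrawler.archive import *"; the extension id and the on-disk timestamp layout below are made up for illustration, not fixed by this commit:

    from ExtensionCrawler.archive import *

    # Hypothetical 32-character extension id (real ids use letters a-p).
    extid = "abcdefghijklmnopabcdefghijklmnop"

    # Archives are sharded by the first three id characters:
    print(get_local_archive_dir(extid))  # "abc/abcdefghijklmnop..."

    # A .crx stored under <id>/<timestamp>/ yields its timestamp back via
    # the parent directory name, formatted as an RFC 1123 date suitable,
    # e.g., for an If-Modified-Since header (layout is an assumption):
    path = "archive/abc/" + extid + "/2017-01-28T12:00:00/extension.crx"
    print(last_modified_utc_date(path))   # "2017-01-28T12:00:00"
    print(last_modified_http_date(path))  # "Sat, 28 Jan 2017 12:00:00 GMT"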

ExtensionCrawler/util.py (new file, +76)

@@ -0,0 +1,76 @@
#!/usr/bin/env python3
#
# Copyright (C) 2016,2017 The University of Sheffield, UK
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
def const_sitemap_url():
    return "https://chrome.google.com/webstore/sitemap"

def const_sitemap_scheme():
    return "http://www.sitemaps.org/schemas/sitemap/0.9"

def const_overview_url(id):
    return 'https://chrome.google.com/webstore/detail/{}'.format(id)

def const_store_url():
    return 'https://chrome.google.com/webstore'

def const_review_url():
    return 'https://chrome.google.com/reviews/components'

def const_support_url():
    return 'https://chrome.google.com/reviews/components'

def const_download_url():
    return 'https://clients2.google.com/service/update2/crx?response=redirect&nacl_arch=x86-64&prodversion=9999.0.9999.0&x=id%3D{}%26uc'

def const_categories():
    return [
        'extensions', 'ext/22-accessibility', 'ext/10-blogging',
        'ext/15-by-google', 'ext/11-web-development', 'ext/14-fun',
        'ext/6-news', 'ext/28-photos', 'ext/7-productivity',
        'ext/38-search-tools', 'ext/12-shopping', 'ext/1-communication',
        'ext/13-sports'
    ]

def const_support_payload(ext_id, start, end):
    return ('req={{ "appId":94,'
            '"version":"150922",'
            '"hl":"en",'
            '"specs":[{{"type":"CommentThread",'
            '"url":"http%3A%2F%2Fchrome.google.com%2Fextensions%2Fpermalink%3Fid%3D{}",'
            '"groups":"chrome_webstore_support",'
            '"startindex":"{}",'
            '"numresults":"{}",'
            '"id":"379"}}],'
            '"internedKeys":[],'
            '"internedValues":[]}}').format(ext_id, start, end)

def const_review_payload(ext_id, start, end):
    return ('req={{ "appId":94,'
            '"version":"150922",'
            '"hl":"en",'
            '"specs":[{{"type":"CommentThread",'
            '"url":"http%3A%2F%2Fchrome.google.com%2Fextensions%2Fpermalink%3Fid%3D{}",'
            '"groups":"chrome_webstore",'
            '"sortby":"cws_qscore",'
            '"startindex":"{}",'
            '"numresults":"{}",'
            '"id":"428"}}],'
            '"internedKeys":[],'
            '"internedValues":[]}}').format(ext_id, start, end)
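
util.py only collects constants and request-payload builders; the crawler fills in the extension id and paging parameters at the call site. A sketch of the intended use with the requests library; sending the payload as a plain POST body and the paging values chosen here are assumptions, not taken from this commit:

    import requests
    from ExtensionCrawler.util import *

    extid = "abcdefghijklmnopabcdefghijklmnop"  # hypothetical id

    # const_download_url() is a template; the caller splices in the id:
    crx = requests.get(const_download_url().format(extid))

    # Reviews and support threads are fetched by POSTing a form-style
    # payload to the components endpoint, paged via start/end indices
    # (assumed usage):
    reviews = requests.post(
        const_review_url(), data=const_review_payload(extid, 0, 100))
    support = requests.post(
        const_support_url(), data=const_support_payload(extid, 0, 100))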

crawler (53 lines changed: +3, -50)

@@ -24,8 +24,9 @@ import requests
from time import sleep
from random import randint
from datetime import datetime, timezone
import ExtensionCrawler.discover
import ExtensionCrawler.archive
from ExtensionCrawler.discover import *
from ExtensionCrawler.archive import *
from ExtensionCrawler.util import *
import dateutil
import dateutil.parser
@@ -114,23 +115,6 @@ class UpdateResult:
def google_dos_protection(max=3):
    sleep(randint(1, max) * .5)

def write_text(dir, fname, text):
    with open(os.path.join(dir, fname), 'w') as f:
        f.write(text)

def store_request_metadata(dir, fname, request):
    write_text(dir, fname + ".headers", str(request.headers))
    write_text(dir, fname + ".status", str(request.status_code))
    write_text(dir, fname + ".url", str(request.url))

def store_request_text(dir, fname, request):
    write_text(dir, fname, request.text)
    store_request_metadata(dir, fname, request)

def log(verbose, msg):
    if verbose:
        sys.stdout.write(msg)
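
This hunk deletes the request-storage helpers from the crawler script; they now live in ExtensionCrawler.archive. A sketch of what store_request_text records for one fetched page; the target directory and filename are made up for illustration:

    import requests
    from ExtensionCrawler.archive import store_request_text

    res = requests.get("https://chrome.google.com/webstore")
    # Writes four files side by side: "overview.html" with the response
    # body, plus .headers, .status and .url with the response metadata:
    store_request_text("/tmp", "overview.html", res)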
@@ -150,37 +134,6 @@ def update_overview(dir, verbose, ext_id):
    return RequestResult(res)

def httpdate(dt):
    weekday = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"][dt.weekday()]
    month = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
        "Nov", "Dec"
    ][dt.month - 1]
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
        weekday, dt.day, month, dt.year, dt.hour, dt.minute, dt.second)

def last_modified_utc_date(path):
    if path is "":
        return ""
    return os.path.split(os.path.dirname(path))[1]

def last_modified_http_date(path):
    if path is "":
        return ""
    return httpdate(dateutil.parser.parse(last_modified_utc_date(path)))

def last_crx(dir, extid):
    old_archives = sorted(
        glob.glob(os.path.join(os.path.dirname(dir), "*/*.crx")))
    last_archive = ""
    if old_archives != []:
        last_archive = old_archives[-1]
    return last_archive
def validate_crx_response(res, extfilename):
    regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$')
    if 'Content-Type' not in res.headers: