Improved crawler script.

Michael Herzberg 2016-09-15 10:52:40 +01:00
parent d669b39493
commit 5b1bcd3a88
1 changed file with 34 additions and 17 deletions

@@ -5,6 +5,7 @@ import sys
 import os
 import json
 import re
+import argparse
 
 class Error(Exception):
     pass
@@ -20,18 +21,19 @@ class UnauthorizedError(Error):
         self.extid = extid
 
 class ExtensionCrawler:
+    possible_categories = ['extensions', 'ext/22-accessibility', 'ext/10-blogging', 'ext/15-by-google', 'ext/11-web-development', 'ext/14-fun', 'ext/6-news', 'ext/28-photos', 'ext/7-productivity', 'ext/38-search-tools', 'ext/12-shopping', 'ext/1-communication', 'ext/13-sports']
     regexExtid = re.compile(r'^[a-z]+$')
     regexExtfilename = re.compile(r'^extension[_0-9]+\.crx$')
 
-    downloadUrl = 'https://clients2.google.com/service/update2/crx?response=redirect&nacl_arch=x86-64&prodversion=9999.0.9999.0&x=id%3D{}%26uc'
-    extensionListUrl = 'https://chrome.google.com/webstore/ajax/item?pv=20160822&count=200&category=extensions'
-    detailUrl = 'https://chrome.google.com/webstore/detail/{}'
+    download_url = 'https://clients2.google.com/service/update2/crx?response=redirect&nacl_arch=x86-64&prodversion=9999.0.9999.0&x=id%3D{}%26uc'
+    extension_list_url = 'https://chrome.google.com/webstore/ajax/item?pv=20160822&count={}&category={}'
+    detail_url = 'https://chrome.google.com/webstore/detail/{}'
 
     def __init__(self, basedir):
         self.basedir = basedir
 
-    def downloadExtension(self, extid, extdir=""):
-        extresult = requests.get(self.downloadUrl.format(extid), stream=True)
+    def download_extension(self, extid, extdir=""):
+        extresult = requests.get(self.download_url.format(extid), stream=True)
         if extresult.status_code == 401:
             raise UnauthorizedError(extid)
         if not 'Content-Type' in extresult.headers:
@@ -46,12 +48,12 @@ class ExtensionCrawler:
                 if chunk: # filter out keep-alive new chunks
                     f.write(chunk)
 
-    def downloadStorepage(self, extid, extdir):
-        extpageresult = requests.get(self.detailUrl.format(extid))
+    def download_storepage(self, extid, extdir):
+        extpageresult = requests.get(self.detail_url.format(extid))
         with open(os.path.join(extdir, 'storepage.html'), 'w') as f:
             f.write(extpageresult.text)
 
-    def handleExtension(self, extinfo):
+    def handle_extension(self, extinfo):
         extid = extinfo[0]
         if not self.regexExtid.match(extid):
             raise CrawlError(extid, '{} is not a valid extension id.\n'.format(extid))
@@ -64,12 +66,13 @@ class ExtensionCrawler:
 
         with open(os.path.join(extdir, 'metadata.json'), 'w') as f:
             json.dump(extinfo, f, indent=5)
-        self.downloadStorepage(extid, extdir)
-        self.downloadExtension(extid, extdir)
+        self.download_storepage(extid, extdir)
+        self.download_extension(extid, extdir)
 
-    def run(self, nrExtensions):
+    def run(self, category, nrExtensions):
         sys.stdout.write('Downloading extensions into folder "{}"...\n'.format(self.basedir))
-        bigjson = json.loads(requests.post(self.extensionListUrl.format(nrExtensions)).text.lstrip(")]}'\n"))
+        response = requests.post(self.extension_list_url.format(nrExtensions, category)).text
+        bigjson = json.loads(response.lstrip(")]}'\n"))
         extinfos = bigjson[1][1]
 
         newExtensions = 0
@@ -78,7 +81,7 @@ class ExtensionCrawler:
             sys.stdout.write('Processing extension "{}"...'.format(extid))
             sys.stdout.flush()
             try:
-                self.handleExtension(extinfo)
+                self.handle_extension(extinfo)
                 sys.stdout.write('Done!\n')
                 newExtensions += 1
             except CrawlError as cerr:
@@ -87,11 +90,25 @@ class ExtensionCrawler:
                 sys.stderr.write('Page content was:\n')
                 sys.stderr.write('{}\n'.format(cerr.pagecontent))
             except UnauthorizedError as uerr:
-                sys.stdout.write('Error: login needed')
+                sys.stdout.write('Error: login needed\n')
         sys.stdout.write('Downloaded {} new extensions.\n'.format(newExtensions))
 
 if __name__ == '__main__':
-    crawler = ExtensionCrawler('downloaded')
+    parser = argparse.ArgumentParser(description='Downloads extensions from the Chrome Web Store.')
+    parser.add_argument('-t', '--interval', nargs='?', const=5, type=int, help='Keep downloading extensions every X seconds.')
+    parser.add_argument('-i', '--iterate', metavar='i', default=1, type=int, help='Queries the store i times for a list of extensions.')
+    parser.add_argument('-n', '--nrexts', metavar='N', default=200, type=int, help='The number of extensions to be downloaded per request (Google does not accept values much higher than 200).')
+    parser.add_argument('-c', '--category', default='extensions', choices=ExtensionCrawler.possible_categories, help='The extension category from which extensions should be downloaded.')
+    parser.add_argument('-d', '--dest', default='downloaded', help='The directory in which the downloaded extensions should be stored.')
+    args = parser.parse_args()
 
-    # 200 extensions is roughly the maximum number allowed
-    crawler.run(200)
+    crawler = ExtensionCrawler(args.dest)
+
+    if args.interval:
+        while True:
+            for i in range(args.iterate):
+                crawler.run(args.category, args.nrexts)
+            time.sleep(args.interval)
+    else:
+        for i in range(args.iterate):
+            crawler.run(args.category, args.nrexts)
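
For orientation, a minimal sketch of how the reworked crawler might be driven after this commit, mirroring the defaults of the new command-line options. The module name "crawler" is an assumption for illustration; the changed file's name is not shown on this page.

# Hypothetical usage sketch; the module name is assumed, not taken from this commit.
from crawler import ExtensionCrawler

crawler = ExtensionCrawler('downloaded')   # same directory as the -d/--dest default
crawler.run('extensions', 200)             # same values as the -c/--category and -n/--nrexts defaults
# Roughly equivalent CLI call, assuming the script file is named crawler.py:
#   python crawler.py -c extensions -n 200 -d downloaded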