diff --git a/crawler.py b/crawler.py
index 999b23c..4cd1163 100755
--- a/crawler.py
+++ b/crawler.py
@@ -5,6 +5,8 @@ import sys
 import os
 import json
 import re
+import argparse
+import time

 class Error(Exception):
     pass
@@ -20,18 +22,19 @@ class UnauthorizedError(Error):
         self.extid = extid

 class ExtensionCrawler:
+    possible_categories = ['extensions', 'ext/22-accessibility', 'ext/10-blogging', 'ext/15-by-google', 'ext/11-web-development', 'ext/14-fun', 'ext/6-news', 'ext/28-photos', 'ext/7-productivity', 'ext/38-search-tools', 'ext/12-shopping', 'ext/1-communication', 'ext/13-sports']
     regexExtid = re.compile(r'^[a-z]+$')
     regexExtfilename = re.compile(r'^extension[_0-9]+\.crx$')
-    downloadUrl = 'https://clients2.google.com/service/update2/crx?response=redirect&nacl_arch=x86-64&prodversion=9999.0.9999.0&x=id%3D{}%26uc'
-    extensionListUrl = 'https://chrome.google.com/webstore/ajax/item?pv=20160822&count=200&category=extensions'
-    detailUrl = 'https://chrome.google.com/webstore/detail/{}'
+    download_url = 'https://clients2.google.com/service/update2/crx?response=redirect&nacl_arch=x86-64&prodversion=9999.0.9999.0&x=id%3D{}%26uc'
+    extension_list_url = 'https://chrome.google.com/webstore/ajax/item?pv=20160822&count={}&category={}'
+    detail_url = 'https://chrome.google.com/webstore/detail/{}'

     def __init__(self, basedir):
         self.basedir = basedir

-    def downloadExtension(self, extid, extdir=""):
-        extresult = requests.get(self.downloadUrl.format(extid), stream=True)
+    def download_extension(self, extid, extdir=""):
+        extresult = requests.get(self.download_url.format(extid), stream=True)
         if extresult.status_code == 401:
             raise UnauthorizedError(extid)
         if not 'Content-Type' in extresult.headers:
@@ -46,12 +49,12 @@ class ExtensionCrawler:
                 if chunk: # filter out keep-alive new chunks
                     f.write(chunk)

-    def downloadStorepage(self, extid, extdir):
-        extpageresult = requests.get(self.detailUrl.format(extid))
+    def download_storepage(self, extid, extdir):
+        extpageresult = requests.get(self.detail_url.format(extid))
         with open(os.path.join(extdir, 'storepage.html'), 'w') as f:
             f.write(extpageresult.text)

-    def handleExtension(self, extinfo):
+    def handle_extension(self, extinfo):
         extid = extinfo[0]
         if not self.regexExtid.match(extid):
             raise CrawlError(extid, '{} is not a valid extension id.\n'.format(extid))
@@ -64,12 +67,13 @@ class ExtensionCrawler:
         with open(os.path.join(extdir, 'metadata.json'), 'w') as f:
             json.dump(extinfo, f, indent=5)

-        self.downloadStorepage(extid, extdir)
-        self.downloadExtension(extid, extdir)
+        self.download_storepage(extid, extdir)
+        self.download_extension(extid, extdir)

-    def run(self, nrExtensions):
+    def run(self, category, nrExtensions):
         sys.stdout.write('Downloading extensions into folder "{}"...\n'.format(self.basedir))
-        bigjson = json.loads(requests.post(self.extensionListUrl.format(nrExtensions)).text.lstrip(")]}'\n"))
+        response = requests.post(self.extension_list_url.format(nrExtensions, category)).text
+        bigjson = json.loads(response.lstrip(")]}'\n"))
         extinfos = bigjson[1][1]

         newExtensions = 0
@@ -78,7 +82,7 @@
             sys.stdout.write('Processing extension "{}"...'.format(extid))
             sys.stdout.flush()
             try:
-                self.handleExtension(extinfo)
+                self.handle_extension(extinfo)
                 sys.stdout.write('Done!\n')
                 newExtensions += 1
             except CrawlError as cerr:
@@ -87,11 +91,25 @@
                 sys.stderr.write('Page content was:\n')
                 sys.stderr.write('{}\n'.format(cerr.pagecontent))
             except UnauthorizedError as uerr:
-                sys.stdout.write('Error: login needed')
+                sys.stdout.write('Error: login needed\n')

         sys.stdout.write('Downloaded {} new extensions.\n'.format(newExtensions))

 if __name__ == '__main__':
-    crawler = ExtensionCrawler('downloaded')
+    parser = argparse.ArgumentParser(description='Downloads extensions from the Chrome Web Store.')
+    parser.add_argument('-t', '--interval', nargs='?', const=5, type=int, help='Keep downloading extensions every X seconds.')
+    parser.add_argument('-i', '--iterate', metavar='i', default=1, type=int, help='Queries the store i times for a list of extensions.')
+    parser.add_argument('-n', '--nrexts', metavar='N', default=200, type=int, help='The number of extensions to be downloaded per request (Google does not accept values much higher than 200).')
+    parser.add_argument('-c', '--category', default='extensions', choices=ExtensionCrawler.possible_categories, help='The extension category from which extensions should be downloaded.')
+    parser.add_argument('-d', '--dest', default='downloaded', help='The directory in which the downloaded extensions should be stored.')
-    # 200 extensions is roughly the maximum number allowed
-    crawler.run(200)
+    args = parser.parse_args()
+    crawler = ExtensionCrawler(args.dest)
+
+    if args.interval:
+        while True:
+            for i in range(args.iterate):
+                crawler.run(args.category, args.nrexts)
+            time.sleep(args.interval)
+    else:
+        for i in range(args.iterate):
+            crawler.run(args.category, args.nrexts)
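
Example invocation of the patched script (illustrative only, not part of the patch; the flag names, defaults, and category values come from the argparse setup added above). This downloads up to 200 extensions from the productivity category into ./downloaded, querying the store three times per pass and sleeping 3600 seconds between passes:

    ./crawler.py --category ext/7-productivity --nrexts 200 --iterate 3 --interval 3600 --dest downloaded

Without --interval the script performs the configured number of passes once and exits; passing -t with no value makes it sleep 5 seconds between passes, per const=5.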