Improved crawler script.
This commit is contained in:
parent
d669b39493
commit
5b1bcd3a88
51
crawler.py
51
crawler.py
|
@ -5,6 +5,7 @@ import sys
|
|||
import os
|
||||
import json
|
||||
import re
|
||||
import argparse
|
||||
|
||||
class Error(Exception):
|
||||
pass
|
||||
|
@ -20,18 +21,19 @@ class UnauthorizedError(Error):
|
|||
self.extid = extid
|
||||
|
||||
class ExtensionCrawler:
|
||||
possible_categories = ['extensions', 'ext/22-accessibility', 'ext/10-blogging', 'ext/15-by-google', 'ext/11-web-development', 'ext/14-fun', 'ext/6-news', 'ext/28-photos', 'ext/7-productivity', 'ext/38-search-tools', 'ext/12-shopping', 'ext/1-communication', 'ext/13-sports']
|
||||
regexExtid = re.compile(r'^[a-z]+$')
|
||||
regexExtfilename = re.compile(r'^extension[_0-9]+\.crx$')
|
||||
|
||||
downloadUrl = 'https://clients2.google.com/service/update2/crx?response=redirect&nacl_arch=x86-64&prodversion=9999.0.9999.0&x=id%3D{}%26uc'
|
||||
extensionListUrl = 'https://chrome.google.com/webstore/ajax/item?pv=20160822&count=200&category=extensions'
|
||||
detailUrl = 'https://chrome.google.com/webstore/detail/{}'
|
||||
download_url = 'https://clients2.google.com/service/update2/crx?response=redirect&nacl_arch=x86-64&prodversion=9999.0.9999.0&x=id%3D{}%26uc'
|
||||
extension_list_url = 'https://chrome.google.com/webstore/ajax/item?pv=20160822&count={}&category={}'
|
||||
detail_url = 'https://chrome.google.com/webstore/detail/{}'
|
||||
|
||||
def __init__(self, basedir):
|
||||
self.basedir = basedir
|
||||
|
||||
def downloadExtension(self, extid, extdir=""):
|
||||
extresult = requests.get(self.downloadUrl.format(extid), stream=True)
|
||||
def download_extension(self, extid, extdir=""):
|
||||
extresult = requests.get(self.download_url.format(extid), stream=True)
|
||||
if extresult.status_code == 401:
|
||||
raise UnauthorizedError(extid)
|
||||
if not 'Content-Type' in extresult.headers:
|
||||
|
@ -46,12 +48,12 @@ class ExtensionCrawler:
|
|||
if chunk: # filter out keep-alive new chunks
|
||||
f.write(chunk)
|
||||
|
||||
def downloadStorepage(self, extid, extdir):
|
||||
extpageresult = requests.get(self.detailUrl.format(extid))
|
||||
def download_storepage(self, extid, extdir):
|
||||
extpageresult = requests.get(self.detail_url.format(extid))
|
||||
with open(os.path.join(extdir, 'storepage.html'), 'w') as f:
|
||||
f.write(extpageresult.text)
|
||||
|
||||
def handleExtension(self, extinfo):
|
||||
def handle_extension(self, extinfo):
|
||||
extid = extinfo[0]
|
||||
if not self.regexExtid.match(extid):
|
||||
raise CrawlError(extid, '{} is not a valid extension id.\n'.format(extid))
|
||||
|
@ -64,12 +66,13 @@ class ExtensionCrawler:
|
|||
with open(os.path.join(extdir, 'metadata.json'), 'w') as f:
|
||||
json.dump(extinfo, f, indent=5)
|
||||
|
||||
self.downloadStorepage(extid, extdir)
|
||||
self.downloadExtension(extid, extdir)
|
||||
self.download_storepage(extid, extdir)
|
||||
self.download_extension(extid, extdir)
|
||||
|
||||
def run(self, nrExtensions):
|
||||
def run(self, category, nrExtensions):
|
||||
sys.stdout.write('Downloading extensions into folder "{}"...\n'.format(self.basedir))
|
||||
bigjson = json.loads(requests.post(self.extensionListUrl.format(nrExtensions)).text.lstrip(")]}'\n"))
|
||||
response = requests.post(self.extension_list_url.format(nrExtensions, category)).text
|
||||
bigjson = json.loads(response.lstrip(")]}'\n"))
|
||||
extinfos = bigjson[1][1]
|
||||
|
||||
newExtensions = 0
|
||||
|
@ -78,7 +81,7 @@ class ExtensionCrawler:
|
|||
sys.stdout.write('Processing extension "{}"...'.format(extid))
|
||||
sys.stdout.flush()
|
||||
try:
|
||||
self.handleExtension(extinfo)
|
||||
self.handle_extension(extinfo)
|
||||
sys.stdout.write('Done!\n')
|
||||
newExtensions += 1
|
||||
except CrawlError as cerr:
|
||||
|
@ -87,11 +90,25 @@ class ExtensionCrawler:
|
|||
sys.stderr.write('Page content was:\n')
|
||||
sys.stderr.write('{}\n'.format(cerr.pagecontent))
|
||||
except UnauthorizedError as uerr:
|
||||
sys.stdout.write('Error: login needed')
|
||||
sys.stdout.write('Error: login needed\n')
|
||||
sys.stdout.write('Downloaded {} new extensions.\n'.format(newExtensions))
|
||||
|
||||
if __name__ == '__main__':
|
||||
crawler = ExtensionCrawler('downloaded')
|
||||
parser = argparse.ArgumentParser(description='Downloads extensions from the Chrome Web Store.')
|
||||
parser.add_argument('-t', '--interval', nargs='?', const=5, type=int, help='Keep downloading extensions every X seconds.')
|
||||
parser.add_argument('-i', '--iterate', metavar='i', default=1, type=int, help='Queries the store i times for a list of extensions.')
|
||||
parser.add_argument('-n', '--nrexts', metavar='N', default=200, type=int, help='The number of extensions to be downloaded per request (Google does not accept values much higher than 200).')
|
||||
parser.add_argument('-c', '--category', default='extensions', choices=ExtensionCrawler.possible_categories, help='The extension category from which extensions should be downloaded.')
|
||||
parser.add_argument('-d', '--dest', default='downloaded', help='The directory in which the downloaded extensions should be stored.')
|
||||
|
||||
# 200 extensions is roughly the maximum number allowed
|
||||
crawler.run(200)
|
||||
args = parser.parse_args()
|
||||
crawler = ExtensionCrawler(args.dest)
|
||||
|
||||
if args.interval:
|
||||
while True:
|
||||
for i in range(args.iterate):
|
||||
crawler.run(args.category, args.nrexts)
|
||||
time.sleep(args.interval)
|
||||
else:
|
||||
for i in range(args.iterate):
|
||||
crawler.run(args.category, args.nrexts)
|
||||
|
|
Loading…
Reference in New Issue