Improved crawler script.
parent d669b39493
commit 5b1bcd3a88
crawler.py | 51
@@ -5,6 +5,7 @@ import sys
 import os
 import json
 import re
+import argparse

 class Error(Exception):
     pass
@@ -20,18 +21,19 @@ class UnauthorizedError(Error):
         self.extid = extid

 class ExtensionCrawler:
+    possible_categories = ['extensions', 'ext/22-accessibility', 'ext/10-blogging', 'ext/15-by-google', 'ext/11-web-development', 'ext/14-fun', 'ext/6-news', 'ext/28-photos', 'ext/7-productivity', 'ext/38-search-tools', 'ext/12-shopping', 'ext/1-communication', 'ext/13-sports']
     regexExtid = re.compile(r'^[a-z]+$')
     regexExtfilename = re.compile(r'^extension[_0-9]+\.crx$')

-    downloadUrl = 'https://clients2.google.com/service/update2/crx?response=redirect&nacl_arch=x86-64&prodversion=9999.0.9999.0&x=id%3D{}%26uc'
-    extensionListUrl = 'https://chrome.google.com/webstore/ajax/item?pv=20160822&count=200&category=extensions'
-    detailUrl = 'https://chrome.google.com/webstore/detail/{}'
+    download_url = 'https://clients2.google.com/service/update2/crx?response=redirect&nacl_arch=x86-64&prodversion=9999.0.9999.0&x=id%3D{}%26uc'
+    extension_list_url = 'https://chrome.google.com/webstore/ajax/item?pv=20160822&count={}&category={}'
+    detail_url = 'https://chrome.google.com/webstore/detail/{}'

     def __init__(self, basedir):
         self.basedir = basedir

-    def downloadExtension(self, extid, extdir=""):
-        extresult = requests.get(self.downloadUrl.format(extid), stream=True)
+    def download_extension(self, extid, extdir=""):
+        extresult = requests.get(self.download_url.format(extid), stream=True)
         if extresult.status_code == 401:
             raise UnauthorizedError(extid)
         if not 'Content-Type' in extresult.headers:
@@ -46,12 +48,12 @@ class ExtensionCrawler:
                 if chunk: # filter out keep-alive new chunks
                     f.write(chunk)

-    def downloadStorepage(self, extid, extdir):
-        extpageresult = requests.get(self.detailUrl.format(extid))
+    def download_storepage(self, extid, extdir):
+        extpageresult = requests.get(self.detail_url.format(extid))
         with open(os.path.join(extdir, 'storepage.html'), 'w') as f:
             f.write(extpageresult.text)

-    def handleExtension(self, extinfo):
+    def handle_extension(self, extinfo):
         extid = extinfo[0]
         if not self.regexExtid.match(extid):
             raise CrawlError(extid, '{} is not a valid extension id.\n'.format(extid))
@@ -64,12 +66,13 @@ class ExtensionCrawler:
         with open(os.path.join(extdir, 'metadata.json'), 'w') as f:
             json.dump(extinfo, f, indent=5)

-        self.downloadStorepage(extid, extdir)
-        self.downloadExtension(extid, extdir)
+        self.download_storepage(extid, extdir)
+        self.download_extension(extid, extdir)

-    def run(self, nrExtensions):
+    def run(self, category, nrExtensions):
         sys.stdout.write('Downloading extensions into folder "{}"...\n'.format(self.basedir))
-        bigjson = json.loads(requests.post(self.extensionListUrl.format(nrExtensions)).text.lstrip(")]}'\n"))
+        response = requests.post(self.extension_list_url.format(nrExtensions, category)).text
+        bigjson = json.loads(response.lstrip(")]}'\n"))
         extinfos = bigjson[1][1]

         newExtensions = 0
@@ -78,7 +81,7 @@ class ExtensionCrawler:
             sys.stdout.write('Processing extension "{}"...'.format(extid))
             sys.stdout.flush()
             try:
-                self.handleExtension(extinfo)
+                self.handle_extension(extinfo)
                 sys.stdout.write('Done!\n')
                 newExtensions += 1
             except CrawlError as cerr:
|
@ -87,11 +90,25 @@ class ExtensionCrawler:
|
||||||
sys.stderr.write('Page content was:\n')
|
sys.stderr.write('Page content was:\n')
|
||||||
sys.stderr.write('{}\n'.format(cerr.pagecontent))
|
sys.stderr.write('{}\n'.format(cerr.pagecontent))
|
||||||
except UnauthorizedError as uerr:
|
except UnauthorizedError as uerr:
|
||||||
sys.stdout.write('Error: login needed')
|
sys.stdout.write('Error: login needed\n')
|
||||||
sys.stdout.write('Downloaded {} new extensions.\n'.format(newExtensions))
|
sys.stdout.write('Downloaded {} new extensions.\n'.format(newExtensions))
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
crawler = ExtensionCrawler('downloaded')
|
parser = argparse.ArgumentParser(description='Downloads extensions from the Chrome Web Store.')
|
||||||
|
parser.add_argument('-t', '--interval', nargs='?', const=5, type=int, help='Keep downloading extensions every X seconds.')
|
||||||
|
parser.add_argument('-i', '--iterate', metavar='i', default=1, type=int, help='Queries the store i times for a list of extensions.')
|
||||||
|
parser.add_argument('-n', '--nrexts', metavar='N', default=200, type=int, help='The number of extensions to be downloaded per request (Google does not accept values much higher than 200).')
|
||||||
|
parser.add_argument('-c', '--category', default='extensions', choices=ExtensionCrawler.possible_categories, help='The extension category from which extensions should be downloaded.')
|
||||||
|
parser.add_argument('-d', '--dest', default='downloaded', help='The directory in which the downloaded extensions should be stored.')
|
||||||
|
|
||||||
# 200 extensions is roughly the maximum number allowed
|
args = parser.parse_args()
|
||||||
crawler.run(200)
|
crawler = ExtensionCrawler(args.dest)
|
||||||
|
|
||||||
|
if args.interval:
|
||||||
|
while True:
|
||||||
|
for i in range(args.iterate):
|
||||||
|
crawler.run(args.category, args.nrexts)
|
||||||
|
time.sleep(args.interval)
|
||||||
|
else:
|
||||||
|
for i in range(args.iterate):
|
||||||
|
crawler.run(args.category, args.nrexts)
|
||||||
|
|
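The reworked interface can be exercised either programmatically or through the new argparse options. The following is a minimal sketch, not part of the commit, assuming the script is importable as crawler; the category and counts are illustrative values taken from the diff above.

# Minimal usage sketch (illustrative, not part of the commit),
# assuming the script above is importable as "crawler".
from crawler import ExtensionCrawler

crawler = ExtensionCrawler('downloaded')    # equivalent to --dest downloaded
crawler.run('ext/7-productivity', 200)      # equivalent to --category ext/7-productivity --nrexts 200

# Roughly equivalent invocations of the new command-line interface:
#   python crawler.py -c ext/7-productivity -n 200
#   python crawler.py -i 3       # query the store listing 3 times
#   python crawler.py -t 3600    # keep re-crawling, sleeping 3600 seconds between rounds

Because -t/--interval is declared with nargs='?' and const=5, passing -t without a value falls back to a 5-second interval, while omitting the flag entirely runs the finite loop in the else branch.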