Re-formatting.

Achim D. Brucker 2017-01-14 20:34:58 +00:00
parent a67aaf0501
commit e096b9a2d5
1 changed file with 31 additions and 22 deletions


@@ -80,7 +80,7 @@ class ExtensionCrawler:
         self.weak_exists_check = weak
         self.google_dos_count = 0
         self.overview_only = overview

     def sha256(self, fname):
         hash_sha256 = hashlib.sha256()
         with open(fname, "rb") as f:
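Note: the hunk cuts off inside sha256. For context, a minimal sketch of how such a chunked hash routine is typically completed (the 4096-byte chunk size and the hexdigest return are assumptions, not part of this diff):

    import hashlib

    def sha256(fname):
        # Read the file in fixed-size chunks so large .crx archives
        # never have to be held in memory at once.
        hash_sha256 = hashlib.sha256()
        with open(fname, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):  # assumed chunk size
                hash_sha256.update(chunk)
        return hash_sha256.hexdigest()  # assumed return form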
@@ -96,10 +96,10 @@ class ExtensionCrawler:
         with open(name + ".url", 'w') as f:
             f.write(str(request.url))

-    def google_dos_protection(self, name, request,max=3):
+    def google_dos_protection(self, name, request, max=3):
         if max >= 1:
             sleep(randint(1, max) * .5)
         if request.status_code == 503:
             if 0 < request.text.find('CAPTCHA'):
                 print(" Warning: Captcha (" + name + ")")
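Note: only the signature line actually changes here. From the surrounding context, the method sleeps a random half-second multiple to stay under the store's rate limit, then treats a 503 whose body mentions CAPTCHA as Google's DOS protection. A standalone sketch of that flow (written as a free function, without self; the return value signalling a hit is an assumption, mirroring the google_dos_count counter seen in __init__):

    from random import randint
    from time import sleep

    def google_dos_protection(name, request, max=3):
        # Random short pause between requests; skipped entirely for the
        # fractional max values some call sites pass (e.g. 0.1).
        if max >= 1:
            sleep(randint(1, max) * .5)
        # A 503 whose body mentions CAPTCHA means the rate limiter fired.
        if request.status_code == 503 and 0 < request.text.find('CAPTCHA'):
            print(" Warning: Captcha (" + name + ")")
            return True  # caller would bump google_dos_count (assumed)
        return False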
@@ -145,8 +145,7 @@ class ExtensionCrawler:
             raise CrawlError(
                 extid,
                 'Expected Content-Type header to be application/x-chrome-extension, but got {}.'.
-                format(extresult.headers['Content-Type']),
-                '\n'.join(text))
+                format(extresult.headers['Content-Type']), '\n'.join(text))
         if not self.regex_extfilename.match(extfilename):
             raise CrawlError(
                 extid,
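Note: both raise sites pass an extension id, a message, and (in the first) the page text; the handler further down reads cerr.message and cerr.pagecontent. The class definition is outside this diff; a minimal sketch consistent with that usage:

    class CrawlError(Exception):
        # Carries the offending extension id, a human-readable message,
        # and optionally the page content that triggered the failure.
        def __init__(self, extid, message, pagecontent=""):
            super().__init__(message)
            self.extid = extid
            self.message = message
            self.pagecontent = pagecontent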
@@ -163,7 +162,7 @@ class ExtensionCrawler:
         self.store_request_metadata(
             os.path.join(extdir, 'storepage.html'), extpageresult)
         self.google_dos_protection(
-            os.path.join(extdir, 'storepage.html'), extpageresult,0.1)
+            os.path.join(extdir, 'storepage.html'), extpageresult, 0.1)
         with open(os.path.join(extdir, 'storepage.html'), 'w') as f:
             f.write(extpageresult.text)
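Note: store_request_metadata is the helper whose body appears in the second hunk above; it records the URL a request actually resolved to next to the saved page. As a standalone sketch (self dropped):

    def store_request_metadata(name, request):
        # Persist the final (post-redirect) URL alongside the downloaded
        # payload, as name + ".url", so later runs can audit the source.
        with open(name + ".url", 'w') as f:
            f.write(str(request.url))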
@@ -234,7 +233,12 @@ class ExtensionCrawler:
             os.path.dirname(os.path.relpath(path, self.basedir)))[1]
         return self.httpdate(dateutil.parser.parse(utc))

-    def update_extension(self, extid, overwrite, extinfo=None,cnt=None,max_cnt=None):
+    def update_extension(self,
+                         extid,
+                         overwrite,
+                         extinfo=None,
+                         cnt=None,
+                         max_cnt=None):
         if not self.regex_extid.match(extid):
             raise CrawlError(extid,
                              '{} is not a valid extension id.\n'.format(extid))
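Note: httpdate itself is not in the diff. Since it is fed a parsed UTC timestamp and the crawler talks to an HTTP store, it presumably renders the RFC 1123 date form used in headers such as If-Modified-Since. A sketch under that assumption:

    from datetime import datetime, timezone

    def httpdate(dt):
        # RFC 1123 date, e.g. 'Sat, 14 Jan 2017 20:34:58 GMT' -- the
        # form HTTP date headers expect. Assumes dt is already in UTC.
        return dt.strftime('%a, %d %b %Y %H:%M:%S GMT')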
@@ -248,13 +252,13 @@ class ExtensionCrawler:
                 if max_cnt != None:
                     sys.stdout.write("/{}".format(max_cnt))
                 sys.stdout.write(") ".format(max_cnt))
             if self.overview_only:
                 sys.stdout.write("overview page of ")
             else:
                 sys.stdout.write("full data set of ")
             sys.stdout.write("extension {}\n".format(extid))
         download_date = datetime.now(timezone.utc).isoformat()
         extdir = os.path.join(self.basedir, extid, download_date)
         if (not overwrite
@@ -262,7 +266,7 @@ class ExtensionCrawler:
             if self.verbose:
                 print(" already archived")
             return False
         os.makedirs(extdir)
         self.download_storepage(extid, extdir)
@@ -270,7 +274,7 @@ class ExtensionCrawler:
         self.download_storepage(extid, extdir)
         if self.overview_only:
             return True

         old_archives = []
         for archive in glob.glob(self.basedir + "/" + extid + "/*/*.crx"):
             if os.path.isfile(archive):
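Note: the hunks above show the on-disk layout that makes this glob work: every run writes into basedir/<extid>/<ISO timestamp>/, so the */*.crx pattern enumerates all previously fetched archives for an extension. A small illustration (the extension id is made up):

    import glob
    import os
    from datetime import datetime, timezone

    basedir, extid = "archive", "aaaabbbbccccddddeeeeffffgggghhhh"  # made-up id
    download_date = datetime.now(timezone.utc).isoformat()
    extdir = os.path.join(basedir, extid, download_date)
    # e.g. archive/aaaa.../2017-01-14T20:34:58+00:00 -- one directory per crawl
    old_archives = [a for a in glob.glob(basedir + "/" + extid + "/*/*.crx")
                    if os.path.isfile(a)]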
@@ -319,7 +323,6 @@ class ExtensionCrawler:
         return True

     def update_extension_list(self, extensions):
         n_attempts = 0
         n_success = 0
@@ -329,12 +332,13 @@ class ExtensionCrawler:
         for extid in extensions:
             try:
                 n_attempts += 1
-                self.update_extension(extid, True,None,n_attempts,len(extensions))
+                self.update_extension(extid, True, None, n_attempts,
+                                      len(extensions))
                 n_success += 1
             except CrawlError as cerr:
                 retry_extids.append(extid)
                 sys.stdout.write(' Error: {}\n'.format(cerr.message))
-                n_errors +=1
+                n_errors += 1
                 if cerr.pagecontent != "":
                     sys.stderr.write(' Page content was:\n')
                     sys.stderr.write(' {}\n'.format(cerr.pagecontent))
@@ -345,24 +349,25 @@ class ExtensionCrawler:
             except ConnectionResetError as cerr:
                 retry_extids.append(extid)
                 sys.stdout.write(' Error: {}\n'.format(str(cerr)))
-                n_errors +=1
+                n_errors += 1
                 sys.stdout.flush()
         if self.verbose:
             print("*** Summary: Updated {} of {} extensions successfully".
                   format(n_success, n_attempts))
             print("*** Login required: {}".format(n_login_required))
-            print("*** Hit Google DOS protection: {}".format(self.google_dos_count))
+            print("*** Hit Google DOS protection: {}".format(
+                self.google_dos_count))
             print("*** Other Erros: {}".format(n_errors))
             sys.stdout.flush()
         return retry_extids

     def update_extensions(self):
         extensions = os.listdir(self.basedir)
         retry = self.update_extension_list(extensions)
         if retry != []:
-            sys.stdout.write('\n\n')
-            sys.stdout.write('Re-trying failed downloads ... \n')
+            sys.stdout.write('\n\n')
+            sys.stdout.write('Re-trying failed downloads ... \n')
             sys.stdout.flush()
             self.update_extension_list(retry)
@@ -465,7 +470,10 @@ if __name__ == '__main__':
     parser.add_argument(
         '-v', '--verbose', action='store_true', help='Increase verbosity.')
     parser.add_argument(
-        '-o', '--overview', action='store_true', help='Only download/update overview page.')
+        '-o',
+        '--overview',
+        action='store_true',
+        help='Only download/update overview page.')
     parser.add_argument(
         '-w',
         '--weak',
@@ -473,7 +481,8 @@ if __name__ == '__main__':
         help='weak check if crx exists already')
     args = parser.parse_args()

-    crawler = ExtensionCrawler(args.dest, args.verbose, args.weak,args.overview)
+    crawler = ExtensionCrawler(args.dest, args.verbose, args.weak,
+                               args.overview)
     if args.discover:
         if args.interval:
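Note: the diff ends mid-hunk. Tying the pieces together, the CLI flags feed straight into the constructor in the order dest, verbose, weak, overview. A hypothetical direct use of the class with that same argument order (values are examples only, not from the diff):

    # Illustrative only: assumes the class from this file is importable.
    crawler = ExtensionCrawler("archive", True, False, False)
    crawler.update_extensions()  # crawl everything under dest, then retry failures once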