Code reformatting.

This commit is contained in:
Achim D. Brucker 2016-12-02 22:35:52 +00:00
parent a88b94eb5e
commit 85d65ae146
3 changed files with 243 additions and 110 deletions

113
crawler.py Executable file → Normal file
View File

@ -24,35 +24,46 @@ import json
import re
import argparse
class Error(Exception):
pass
class StoreError(Error):
def __init__(self, message, pagecontent=""):
self.message = message
self.pagecontent = pagecontent
class CrawlError(Error):
def __init__(self, extid, message, pagecontent=""):
self.extid = extid
self.message = message
self.pagecontent = pagecontent
class UnauthorizedError(Error):
def __init__(self, extid):
self.extid = extid
class ExtensionCrawler:
possible_categories = ['extensions', 'ext/22-accessibility', 'ext/10-blogging', 'ext/15-by-google', 'ext/11-web-development', 'ext/14-fun', 'ext/6-news', 'ext/28-photos', 'ext/7-productivity', 'ext/38-search-tools', 'ext/12-shopping', 'ext/1-communication', 'ext/13-sports']
regex_extid = re.compile(r'^[a-z]+$')
regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$')
possible_categories = [
'extensions', 'ext/22-accessibility', 'ext/10-blogging',
'ext/15-by-google', 'ext/11-web-development', 'ext/14-fun',
'ext/6-news', 'ext/28-photos', 'ext/7-productivity',
'ext/38-search-tools', 'ext/12-shopping', 'ext/1-communication',
'ext/13-sports'
]
regex_extid = re.compile(r'^[a-z]+$')
regex_extfilename = re.compile(r'^extension[_0-9]+\.crx$')
regex_store_date_string = re.compile(r'"([0-9]{8})"')
download_url = 'https://clients2.google.com/service/update2/crx?response=redirect&nacl_arch=x86-64&prodversion=9999.0.9999.0&x=id%3D{}%26uc'
extension_list_url = 'https://chrome.google.com/webstore/ajax/item?pv={}&count={}&category={}'
download_url = 'https://clients2.google.com/service/update2/crx?response=redirect&nacl_arch=x86-64&prodversion=9999.0.9999.0&x=id%3D{}%26uc'
extension_list_url = 'https://chrome.google.com/webstore/ajax/item?pv={}&count={}&category={}'
#extension_list_url = 'https://chrome.google.com/webstore/ajax/item?pv=20160822&count={}&category={}'
detail_url = 'https://chrome.google.com/webstore/detail/{}'
store_url = 'https://chrome.google.com/webstore'
detail_url = 'https://chrome.google.com/webstore/detail/{}'
store_url = 'https://chrome.google.com/webstore'
def __init__(self, basedir):
self.basedir = basedir
@ -62,15 +73,24 @@ class ExtensionCrawler:
if extresult.status_code == 401:
raise UnauthorizedError(extid)
if not 'Content-Type' in extresult.headers:
raise CrawlError(extid, 'Did not find Content-Type header.', '\n'.join(extresult.iter_lines()))
if not extresult.headers['Content-Type'] == 'application/x-chrome-extension':
raise CrawlError(extid, 'Expected Content-Type header to be application/x-chrome-extension, but got {}.'.format(extresult.headers['Content-Type']), '\n'.join(extresult.iter_lines()))
raise CrawlError(extid, 'Did not find Content-Type header.',
'\n'.join(extresult.iter_lines()))
if not extresult.headers[
'Content-Type'] == 'application/x-chrome-extension':
raise CrawlError(
extid,
'Expected Content-Type header to be application/x-chrome-extension, but got {}.'.
format(extresult.headers['Content-Type']),
'\n'.join(extresult.iter_lines()))
extfilename = os.path.basename(extresult.url)
if not self.regex_extfilename.match(extfilename):
raise CrawlError(extid, '{} is not a valid extension file name, skipping...'.format(extfilename))
raise CrawlError(
extid,
'{} is not a valid extension file name, skipping...'.format(
extfilename))
with open(os.path.join(extdir, extfilename), 'wb') as f:
for chunk in extresult.iter_content(chunk_size=512 * 1024):
if chunk: # filter out keep-alive new chunks
for chunk in extresult.iter_content(chunk_size=512 * 1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
def download_storepage(self, extid, extdir):
@ -81,7 +101,8 @@ class ExtensionCrawler:
def handle_extension(self, extinfo, category=''):
extid = extinfo[0]
if not self.regex_extid.match(extid):
raise CrawlError(extid, '{} is not a valid extension id.\n'.format(extid))
raise CrawlError(extid,
'{} is not a valid extension id.\n'.format(extid))
extdir = os.path.join(self.basedir, category, extid)
if os.path.isdir(extdir):
return False
@ -100,13 +121,17 @@ class ExtensionCrawler:
response = requests.get(self.store_url).text
match = re.search(self.regex_store_date_string, response)
if not match:
raise StoreError('Could not find the date string in the response from {}.'.format(self.store_url), response)
raise StoreError(
'Could not find the date string in the response from {}.'.
format(self.store_url), response)
return match.group(1)
def run(self, categories, nrExtensions):
date_string = self.get_store_date_string()
for category in categories:
response = requests.post(self.extension_list_url.format(date_string, nrExtensions, category)).text
response = requests.post(
self.extension_list_url.format(date_string, nrExtensions,
category)).text
bigjson = json.loads(response.lstrip(")]}'\n"))
extinfos = bigjson[1][1]
@ -114,7 +139,11 @@ class ExtensionCrawler:
for i in range(len(extinfos)):
extid = extinfos[i][0]
try:
sys.stdout.write('\rDownloading into {} ... {} of {} done ({} new ones)'.format(os.path.join(self.basedir, category), i, len(extinfos), newExtensions))
sys.stdout.write(
'\rDownloading into {} ... {} of {} done ({} new ones)'.
format(
os.path.join(self.basedir, category), i,
len(extinfos), newExtensions))
sys.stdout.flush()
if self.handle_extension(extinfos[i], category):
newExtensions += 1
@ -125,15 +154,51 @@ class ExtensionCrawler:
sys.stderr.write('{}\n'.format(cerr.pagecontent))
except UnauthorizedError as uerr:
sys.stdout.write('Error: login needed\n')
sys.stdout.write('\rDownloading into {} ... {} of {} done ({} new ones)\n'.format(os.path.join(self.basedir, category), len(extinfos), len(extinfos), newExtensions))
sys.stdout.write(
'\rDownloading into {} ... {} of {} done ({} new ones)\n'.
format(
os.path.join(self.basedir, category),
len(extinfos), len(extinfos), newExtensions))
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Downloads extensions from the Chrome Web Store.')
parser.add_argument('-t', '--interval', nargs='?', const=5, type=int, help='Keep downloading extensions every X seconds.')
parser.add_argument('-i', '--iterate', metavar='i', default=1, type=int, help='Queries the store i times for a list of extensions.')
parser.add_argument('-n', '--nrexts', metavar='N', default=200, type=int, help='The number of extensions to be downloaded per request (Google does not accept values much higher than 200).')
parser.add_argument('-c', '--categories', nargs='*', default=ExtensionCrawler.possible_categories, choices=ExtensionCrawler.possible_categories, help='Only download extensions from the specified categories.')
parser.add_argument('-d', '--dest', default='downloaded', help='The directory in which the downloaded extensions should be stored.')
parser = argparse.ArgumentParser(
description='Downloads extensions from the Chrome Web Store.')
parser.add_argument(
'-t',
'--interval',
nargs='?',
const=5,
type=int,
help='Keep downloading extensions every X seconds.')
parser.add_argument(
'-i',
'--iterate',
metavar='i',
default=1,
type=int,
help='Queries the store i times for a list of extensions.')
parser.add_argument(
'-n',
'--nrexts',
metavar='N',
default=200,
type=int,
help='The number of extensions to be downloaded per request (Google does not accept values much higher than 200).'
)
parser.add_argument(
'-c',
'--categories',
nargs='*',
default=ExtensionCrawler.possible_categories,
choices=ExtensionCrawler.possible_categories,
help='Only download extensions from the specified categories.')
parser.add_argument(
'-d',
'--dest',
default='downloaded',
help='The directory in which the downloaded extensions should be stored.'
)
args = parser.parse_args()
crawler = ExtensionCrawler(args.dest)

155
crx-tool.py Executable file → Normal file
View File

@ -26,121 +26,142 @@ from Crypto.Signature import PKCS1_v1_5
import zipfile
import io
class CrxFile:
def __init__(self,filename,magic,version,pk_len,sig_len,pk,sig,header_len,data):
self.file = filename
self.magic = magic
self.version = version
self.pk_len = pk_len
self.sig_len = sig_len
self.pk = pk
self.sig = sig
def __init__(self, filename, magic, version, pk_len, sig_len, pk, sig,
header_len, data):
self.file = filename
self.magic = magic
self.version = version
self.pk_len = pk_len
self.sig_len = sig_len
self.pk = pk
self.sig = sig
self.header_len = header_len
self.data = data
self.data = data
def is_valid_magic(magic):
return (b'Cr24' == magic)
def is_crxfile (filename):
def is_crxfile(filename):
"Check magic number: crx files should start with \"Cr24\"."
file = open (filename, 'rb')
magic = file.read(4)
file = open(filename, 'rb')
magic = file.read(4)
file.close()
return is_valid_magic(magic)
def check_signature(pk,sig,data):
key = RSA.importKey(pk)
def check_signature(pk, sig, data):
key = RSA.importKey(pk)
hash = SHA.new(data)
return PKCS1_v1_5.new(key).verify(hash, sig)
def read_crx(filename):
"Read header of an crx file (https://developer.chrome.com/extensions/crx)."
file = open (filename, 'rb')
magic = file.read(4)
version = int.from_bytes(file.read(4), byteorder='little')
pk_len = int.from_bytes(file.read(4), byteorder='little')
sig_len = int.from_bytes(file.read(4), byteorder='little')
pk = file.read(pk_len)
sig = file.read(sig_len)
header_len = 16+pk_len+sig_len
data = file.read()
file = open(filename, 'rb')
magic = file.read(4)
version = int.from_bytes(file.read(4), byteorder='little')
pk_len = int.from_bytes(file.read(4), byteorder='little')
sig_len = int.from_bytes(file.read(4), byteorder='little')
pk = file.read(pk_len)
sig = file.read(sig_len)
header_len = 16 + pk_len + sig_len
data = file.read()
file.close()
return CrxFile(filename,magic,version,pk_len,sig_len,pk,sig,header_len,data)
return CrxFile(filename, magic, version, pk_len, sig_len, pk, sig,
header_len, data)
def print_crx_info(verbose,crx):
def print_crx_info(verbose, crx):
if is_valid_magic(crx.magic):
magic="valid"
magic = "valid"
else:
magic="invalid"
if check_signature(crx.pk,crx.sig,crx.data):
sig="valid"
magic = "invalid"
if check_signature(crx.pk, crx.sig, crx.data):
sig = "valid"
else:
sig="invalid"
print("Filename: "+crx.file)
print("Header size: "+str(crx.header_len))
print("Size: "+str(crx.header_len+len(crx.data)))
print("Magic byte: "+str(crx.magic.decode("utf-8"))+" ("+magic+")")
print("Version: "+str(crx.version))
print("Signature: "+sig)
print("Public Key ["+str(crx.pk_len)+"]:")
key = RSA.importKey(crx.pk)
print (key.exportKey().decode("utf-8"))
sig = "invalid"
print("Filename: " + crx.file)
print("Header size: " + str(crx.header_len))
print("Size: " + str(crx.header_len + len(crx.data)))
print("Magic byte: " + str(crx.magic.decode("utf-8")) + " (" + magic +
")")
print("Version: " + str(crx.version))
print("Signature: " + sig)
print("Public Key [" + str(crx.pk_len) + "]:")
key = RSA.importKey(crx.pk)
print(key.exportKey().decode("utf-8"))
if verbose:
print("Signature ["+str(crx.sig_len)+"]: "+str(binascii.hexlify(crx.sig)))
print("Signature [" + str(crx.sig_len) + "]: " + str(
binascii.hexlify(crx.sig)))
out = f = io.BytesIO(crx.data)
zf = zipfile.ZipFile(out, 'r')
print("Zip content:")
for info in zf.infolist():
print('{:8d} {:8d}'.format(info.file_size, info.compress_size), info.filename)
def verify_crxfile (verbose, filename):
print('{:8d} {:8d}'.format(info.file_size, info.compress_size),
info.filename)
def verify_crxfile(verbose, filename):
if is_crxfile(filename):
if verbose:
print("Found correct magic bytes.")
print_crx_info(verbose,read_crx(filename))
return 0
print_crx_info(verbose, read_crx(filename))
return 0
else:
if verbose:
print("No valid magic bytes found")
return -1
def extract_crxfile(verbose, force, filename, destdir):
crx = read_crx(filename)
if is_valid_magic(crx.magic) | force:
if ("" == destdir) | (destdir is None):
destdir = "."
destdir = "."
if filename.endswith(".crx"):
dirname = filename[0:len(filename)-4]
dirname = filename[0:len(filename) - 4]
else:
dirname = filename
out = f = io.BytesIO(crx.data)
zf = zipfile.ZipFile(out, 'r')
zf.extractall(destdir+"/"+dirname)
print ("Content extracted into: "+destdir+"/"+dirname)
zf.extractall(destdir + "/" + dirname)
print("Content extracted into: " + destdir + "/" + dirname)
else:
print ("Input file not valid.")
print("Input file not valid.")
def main():
parser = argparse.ArgumentParser()
parser.add_argument("file", help="chrome extension archive (*.crx)")
parser.add_argument('targetdir', nargs='?', default="")
parser.add_argument("-c", "--check", help="verify format and signature of <file>",
action="store_true")
parser.add_argument("-e", "--extract", help="extract <file>",
action="store_true")
parser.add_argument("-f", "--force", help="apply action also to (potential) invalid files",
action="store_true")
parser.add_argument("-v", "--verbose", help="increase verbosity",
action="store_true")
parser.add_argument(
"-c",
"--check",
help="verify format and signature of <file>",
action="store_true")
parser.add_argument(
"-e", "--extract", help="extract <file>", action="store_true")
parser.add_argument(
"-f",
"--force",
help="apply action also to (potential) invalid files",
action="store_true")
parser.add_argument(
"-v", "--verbose", help="increase verbosity", action="store_true")
args = parser.parse_args()
if args.extract:
retval = extract_crxfile(args.verbose, args.force, args.file, args.targetdir)
else:
retval = extract_crxfile(args.verbose, args.force, args.file,
args.targetdir)
else:
retval = verify_crxfile(args.verbose, args.file)
exit(retval)
exit(retval)
if __name__ == "__main__":
main()

85
permstats.py Executable file → Normal file
View File

@ -26,42 +26,55 @@ import re
regex_concrete_url = re.compile(r'^.*://.*[a-z0-9]+\.[a-z]+.*$')
class PermissionHandlerPrintNames:
def __init__(self, permname):
self.permname = permname
self.extinfo = {}
def handle_permission(self, extid, permobj, path):
if self.permname in str(permobj):
with open(os.path.join(path, 'metadata.json')) as f:
metadata = json.load(f)
self.extinfo[extid] = '{} | {} | {}'.format(metadata[1], metadata[6], path)
self.extinfo[extid] = '{} | {} | {}'.format(metadata[1],
metadata[6], path)
def print_result(self, fileobj, delim):
fileobj.write('Extensions that use permission "{}":\n\n'.format(self.permname))
fileobj.write('Extensions that use permission "{}":\n\n'.format(
self.permname))
for extid in self.extinfo:
fileobj.write('{}\n'.format(self.extinfo[extid]))
fileobj.write('\n\n')
class PermissionHandler:
def __init__(self):
self.permissions = {}
self.extids = set()
def handle_permission(self, extid, permobj, path):
self.extids.add(extid)
perm = str(permobj)
if not perm in self.permissions:
self.permissions[perm] = 0
self.permissions[perm] += 1
def print_result(self, fileobj, delim):
fileobj.write('Total: {} extensions\n'.format(len(self.extids)))
for perm in sorted(self.permissions, key=self.permissions.get, reverse=True):
fileobj.write('{}{}{}{}{:.2%}\n'.format(perm, delim, self.permissions[perm], delim, float(self.permissions[perm]) / len(self.extids)))
for perm in sorted(
self.permissions, key=self.permissions.get, reverse=True):
fileobj.write('{}{}{}{}{:.2%}\n'.format(
perm, delim, self.permissions[perm], delim,
float(self.permissions[perm]) / len(self.extids)))
fileobj.write('\n\n')
class PermissionHandlerCondensed:
def __init__(self):
self.permissions = {}
self.extids = set()
self.exts_with_concrete_urls = set()
def handle_permission(self, extid, permobj, path):
self.extids.add(extid)
@ -74,12 +87,18 @@ class PermissionHandlerCondensed:
if not perm in self.permissions:
self.permissions[perm] = 0
self.permissions[perm] += 1
def print_result(self, fileobj, delim):
fileobj.write('Condensed. Total: {} extensions\n'.format(len(self.extids)))
for perm in sorted(self.permissions, key=self.permissions.get, reverse=True):
fileobj.write('{}{}{}{}{:.2%}\n'.format(perm, delim, self.permissions[perm], delim, float(self.permissions[perm]) / len(self.extids)))
fileobj.write('Condensed. Total: {} extensions\n'.format(
len(self.extids)))
for perm in sorted(
self.permissions, key=self.permissions.get, reverse=True):
fileobj.write('{}{}{}{}{:.2%}\n'.format(
perm, delim, self.permissions[perm], delim,
float(self.permissions[perm]) / len(self.extids)))
fileobj.write('\n\n')
class PermissionStatisticGenerator:
def run(category_folder, permhandlers):
for root, dirs, files in os.walk(category_folder):
@ -99,23 +118,51 @@ class PermissionStatisticGenerator:
if 'permissions' in manifest:
for permobj in manifest['permissions']:
for handler in permhandlers:
handler.handle_permission(extid, permobj, root)
handler.handle_permission(extid, permobj,
root)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Prints statistics about the requested permissions of downloaded extensions.')
parser.add_argument('dir', help='The directory in which the extensions are stored. The directory structure must be {category}/{extid}/*.crx.')
parser.add_argument('-d', '--delim', default='\t', help='Delimiter used for the statistics output.')
parser.add_argument('-o', '--output', default=sys.stdout, type=argparse.FileType('w'), help='Save the statistics into a file.')
parser.add_argument('-p', '--permission', help='Prints out all extension names and descriptions that use the given permission.')
parser.add_argument('-c', '--categories', action='store_true', help='Print the results for each category separately.')
parser = argparse.ArgumentParser(
description='Prints statistics about the requested permissions of downloaded extensions.'
)
parser.add_argument(
'dir',
help='The directory in which the extensions are stored. The directory structure must be {category}/{extid}/*.crx.'
)
parser.add_argument(
'-d',
'--delim',
default='\t',
help='Delimiter used for the statistics output.')
parser.add_argument(
'-o',
'--output',
default=sys.stdout,
type=argparse.FileType('w'),
help='Save the statistics into a file.')
parser.add_argument(
'-p',
'--permission',
help='Prints out all extension names and descriptions that use the given permission.'
)
parser.add_argument(
'-c',
'--categories',
action='store_true',
help='Print the results for each category separately.')
args = parser.parse_args()
category_folders = [args.dir]
category_folders = [args.dir]
if args.categories:
category_folders += [os.path.join(args.dir, d) for d in next(os.walk(args.dir))[1]]
category_folders += [
os.path.join(args.dir, d) for d in next(os.walk(args.dir))[1]
]
for category_folder in category_folders:
args.output.write('Results for category {}:\n\n'.format(category_folder))
for category_folder in category_folders:
args.output.write('Results for category {}:\n\n'.format(
category_folder))
if args.permission:
handlers = [PermissionHandlerPrintNames(args.permission)]
else: