Fix some encoding issues.

This commit is contained in:
Michael Herzberg 2018-07-21 01:50:59 +01:00
parent 250bdd2c6b
commit eb616b0ac3
3 changed files with 6 additions and 11 deletions

View File

@ -30,6 +30,7 @@ import os
import glob
import datetime
import hashlib
from jsmin import jsmin
def get_etag(ext_id, datepath, con):
@ -214,16 +215,6 @@ def parse_and_insert_crx(ext_id, datepath, con):
# Trying a different encoding, manifests are weird...
content = raw_content.decode("latin1")
# Attempt to remove JavaScript-style comments from json
comment_regex = re.compile(r'\s*//.*')
multiline_comment_regex = re.compile(r'\s*/\\*.*\\*/\s*')
lines = content.splitlines()
for index, line in enumerate(lines):
if comment_regex.fullmatch(
line) or multiline_comment_regex.fullmatch(line):
lines[index] = ""
content = "\n".join(lines)
con.insert(
"crx",
crx_etag=etag,
@ -232,7 +223,7 @@ def parse_and_insert_crx(ext_id, datepath, con):
manifest=content,
publickey=public_key)
manifest = json.loads(content, strict=False)
manifest = json.loads(jsmin(content), strict=False)
if "permissions" in manifest:
for permission in manifest["permissions"]:
con.insert(

View File

@ -62,6 +62,9 @@ def get_features(s):
def get_simhash(encoding, data):
"""Compute simhash of text."""
if encoding is not None:
# VISCII is not supported by Python; decoding as UTF-8 at least parses the parts that are important for us
if encoding == "VISCII":
encoding = "UTF-8"
str_data = data.decode(encoding=encoding, errors="replace")
else:
str_data = str(data)

View File

@ -13,3 +13,4 @@ GitPython==2.1.5
python_magic==0.4.13
jsbeautifier==1.7.3
pebble==4.3.7
jsmin==2.2.2