Fix some encoding issues.
This commit is contained in:
parent
250bdd2c6b
commit
eb616b0ac3
|
@ -30,6 +30,7 @@ import os
|
|||
import glob
|
||||
import datetime
|
||||
import hashlib
|
||||
from jsmin import jsmin
|
||||
|
||||
|
||||
def get_etag(ext_id, datepath, con):
|
||||
|
@ -214,16 +215,6 @@ def parse_and_insert_crx(ext_id, datepath, con):
|
|||
# Trying a different encoding, manifests are weird...
|
||||
content = raw_content.decode("latin1")
|
||||
|
||||
# Attempt to remove JavaScript-style comments from json
|
||||
comment_regex = re.compile(r'\s*//.*')
|
||||
multiline_comment_regex = re.compile(r'\s*/\\*.*\\*/\s*')
|
||||
lines = content.splitlines()
|
||||
for index, line in enumerate(lines):
|
||||
if comment_regex.fullmatch(
|
||||
line) or multiline_comment_regex.fullmatch(line):
|
||||
lines[index] = ""
|
||||
content = "\n".join(lines)
|
||||
|
||||
con.insert(
|
||||
"crx",
|
||||
crx_etag=etag,
|
||||
|
@ -232,7 +223,7 @@ def parse_and_insert_crx(ext_id, datepath, con):
|
|||
manifest=content,
|
||||
publickey=public_key)
|
||||
|
||||
manifest = json.loads(content, strict=False)
|
||||
manifest = json.loads(jsmin(content), strict=False)
|
||||
if "permissions" in manifest:
|
||||
for permission in manifest["permissions"]:
|
||||
con.insert(
|
||||
|
|
|
@ -62,6 +62,9 @@ def get_features(s):
|
|||
def get_simhash(encoding, data):
|
||||
"""Compute simhash of text."""
|
||||
if encoding is not None:
|
||||
# VISCII is not supported by Python; UTF-8 at least parses the parts that are important for us
|
||||
if encoding == "VISCII":
|
||||
encoding = "UTF-8"
|
||||
str_data = data.decode(encoding=encoding, errors="replace")
|
||||
else:
|
||||
str_data = str(data)
|
||||
|
|
|
@ -13,3 +13,4 @@ GitPython==2.1.5
|
|||
python_magic==0.4.13
|
||||
jsbeautifier==1.7.3
|
||||
pebble==4.3.7
|
||||
jsmin==2.2.2
|
Loading…
Reference in New Issue