Use cchardet for detecting text encoding.

This commit is contained in:
Achim D. Brucker 2019-08-13 00:50:12 +01:00
parent 5c9ab535f4
commit ade5d3ae4e
1 changed file with 9 additions and 7 deletions

View File

@@ -33,6 +33,8 @@ import argparse
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs, urlunparse
import cchardet as chardet
def sanitize_safelink(url):
if "safelinks.protection.outlook.com" in url:
@@ -76,17 +78,17 @@ def main():
# parse command line
if args.file:
fhandle = open(args.file)
fhandle = open(args.file, mode="rb")
else:
fhandle = sys.stdin
fhandle = sys.stdin.buffer
content = ""
for line in fhandle:
content += line
data = fhandle.read()
if fhandle is not sys.stdin:
if fhandle is not sys.stdin.buffer:
fhandle.close()
content = data.decode(encoding=chardet.detect(data)['encoding'], errors="replace")
if args.html:
content=unsanitize_html(content)
else:
@@ -103,4 +105,4 @@ def main():
print(content)
if __name__ == "__main__":
main()
main()