Use cchardet for detecting text encoding.

2019-08-13 00:50:12 +01:00 · 2019-08-13 00:50:12 +01:00 · ade5d3ae4e
parent 5c9ab535f4
commit ade5d3ae4e
1 changed files with 9 additions and 7 deletions
--- a/16
+++ b/16
@@ -33,6 +33,8 @@ import argparse
 import re
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, parse_qs, urlunparse
+import cchardet as chardet
+

 def sanitize_safelink(url):
    if "safelinks.protection.outlook.com" in url:
@@ -76,17 +78,17 @@ def main():

 # parse command line
    if args.file:
-        fhandle = open(args.file)
+        fhandle = open(args.file, mode="rb")
    else:
-        fhandle = sys.stdin
+        fhandle = sys.stdin.buffer
    
-    content = ""
-    for line in fhandle:
-        content += line
+    data = fhandle.read()

-    if fhandle is not sys.stdin:
+    if fhandle is not sys.stdin.buffer:
        fhandle.close()

+    content = data.decode(encoding=chardet.detect(data)['encoding'], errors="replace")
+
    if args.html:
        content=unsanitize_html(content)
    else:
@@ -103,4 +105,4 @@ def main():
        print(content)

 if __name__ == "__main__":
-    main()
+    main()