2019-08-10 10:58:04 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# Copyright (c) 2018-2019 Achim D. Brucker.
|
|
|
|
#
|
|
|
|
# Redistribution and use in source and binary forms, with or without
|
|
|
|
# modification, are permitted provided that the following conditions
|
|
|
|
# are met:
|
|
|
|
# 1. Redistributions of source code must retain the above copyright
|
|
|
|
# notice, this list of conditions and the following disclaimer.
|
|
|
|
# 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
# notice, this list of conditions and the following disclaimer in
|
|
|
|
# the documentation and/or other materials provided with the
|
|
|
|
# distribution.
|
|
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
|
|
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
|
|
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
|
|
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
|
|
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
|
|
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
|
|
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
|
|
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
|
|
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
|
|
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
# POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
#
|
|
|
|
# SPDX-License-Identifier: BSD-2-Clause
|
|
|
|
|
|
|
|
import sys
|
|
|
|
import os.path as path
|
|
|
|
import os
|
|
|
|
import stat
|
|
|
|
import argparse
|
|
|
|
import re
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from urllib.parse import urlparse, parse_qs, urlunparse
|
2019-08-12 23:50:12 +00:00
|
|
|
import cchardet as chardet
|
|
|
|
|
2019-08-10 10:58:04 +00:00
|
|
|
|
|
|
|
def sanitize_safelink(url):
    """Return the original target of an Outlook "safelink" URL.

    URLs that do not contain the safelinks host name are returned
    unchanged. For safelinks, the real destination is read from the first
    ``url`` query parameter of the wrapper URL.

    Args:
        url: The URL string to inspect.

    Returns:
        The unwrapped target URL, ``url`` itself if it is not a safelink,
        or a fixed warning text if the safelink cannot be decoded.
    """
    if "safelinks.protection.outlook.com" not in url:
        return url
    try:
        # parse_qs percent-decodes the parameter value; re-parsing it and
        # calling geturl() yields the target in urllib's normalized form.
        target = urlparse(parse_qs(urlparse(url).query)['url'][0])
        return target.geturl()
    except (KeyError, IndexError, ValueError):
        # KeyError/IndexError: the wrapper carries no usable ``url``
        # parameter. ValueError: urlparse rejected a malformed URL.
        # Anything else (e.g. KeyboardInterrupt) is left to propagate.
        return " Warning: Removed corrupted safelink. "
|
|
|
|
|
2019-08-15 10:07:05 +00:00
|
|
|
|
2019-08-10 10:58:04 +00:00
|
|
|
def unsanitize_txt(content):
    """Replace every safelink URL found in plain text by its target.

    Each http/https URL matched in ``content`` is passed through
    ``sanitize_safelink`` and substituted by the result; trailing
    whitespace of the resulting text is stripped.

    Args:
        content: The text to process.

    Returns:
        The processed text without trailing whitespace.
    """
    pattern = re.compile(
        r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-\'\/\+-;=\?-@.&+_]|[!*,]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)',
        re.MULTILINE)

    def _unwrap(match):
        # Group 1 is the complete URL that was matched.
        return sanitize_safelink(match.group(1))

    rewritten = pattern.sub(_unwrap, content)
    return rewritten.rstrip()
|
|
|
|
|
2019-08-10 10:58:04 +00:00
|
|
|
|
|
|
|
def unsanitize_html(content):
    """Restore the original link targets in an HTML document.

    For every ``<a>`` element carrying an ``originalsrc`` attribute, the
    current ``href`` (if any) is preserved in a ``safelink`` attribute,
    ``href`` is set to the value of ``originalsrc``, and ``originalsrc``
    is removed.

    Args:
        content: The HTML document as a string.

    Returns:
        The modified document serialized back to a string.
    """
    soup = BeautifulSoup(content, "html.parser")
    # find_all is the supported method name; findAll is a deprecated alias.
    for a in soup.find_all('a'):
        if a.has_attr('originalsrc'):
            # An anchor may lack href; only keep the old value when present
            # instead of failing with KeyError.
            if a.has_attr('href'):
                a['safelink'] = a['href']
            a['href'] = a['originalsrc']
            del a['originalsrc']
    return str(soup)
|
|
|
|
|
2019-08-15 10:07:05 +00:00
|
|
|
|
2019-08-10 10:58:04 +00:00
|
|
|
def main():
    """Main function of the safelink tool.

    Reads the input from the file given on the command line (or from
    standard input), decodes it using the detected character encoding,
    rewrites safelinks (HTML mode with ``--html``, plain-text mode
    otherwise), and either writes the result back to the input file
    (``-i``/``--in-situ`` together with a file name) or prints it.
    """
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("--html",
                        help="HTML",
                        action="store_true",
                        default=False)
    parser.add_argument("-i",
                        "--in-situ",
                        help="modify file",
                        action="store_true",
                        default=False)
    parser.add_argument("-v",
                        "--verbose",
                        help="increase verbosity",
                        action="store_true")
    parser.add_argument('file', nargs='?', default=None)
    args = parser.parse_args()

    # Read raw bytes so that the encoding can be detected before decoding.
    if args.file:
        with open(args.file, mode="rb") as fhandle:
            data = fhandle.read()
    else:
        data = sys.stdin.buffer.read()

    # Detection may yield no encoding (e.g. for empty input); decoding with
    # ``None`` would raise TypeError, so fall back to UTF-8.
    encoding = chardet.detect(data)['encoding'] or "utf-8"
    try:
        content = data.decode(encoding=encoding, errors="replace")
    except LookupError:
        # The detected name is not a codec known to Python.
        content = data.decode(encoding="utf-8", errors="replace")

    if args.html:
        content = unsanitize_html(content)
    else:
        content = unsanitize_txt(content)

    if args.file and args.in_situ:
        # Make sure the file is writable before replacing its content.
        st = os.stat(args.file)
        os.chmod(args.file, st.st_mode | stat.S_IWRITE)
        # NOTE(review): the file is written with the platform default text
        # encoding, not the detected input encoding - confirm this is intended.
        with open(args.file, "w") as fhandle:
            fhandle.write(content)
    else:
        print(content)
|
|
|
|
|
2019-08-15 10:07:05 +00:00
|
|
|
|
2019-08-10 10:58:04 +00:00
|
|
|
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|