From: funilrys Date: Wed, 28 Feb 2018 22:06:58 +0000 (+0100) Subject: Review of get_file_by_url() X-Git-Url: http://git.99rst.org/?a=commitdiff_plain;h=c82f69195282bda1e5c2a6d756bebf42ce92ac9d;p=stevenblack-hosts.git Review of get_file_by_url() Please note that this patch also introduces domain_to_idna(), which is in charge of converting a domain in a line into IDNA and/or UTF-8 format. Also note the introduction of BeautifulSoup() which helps us to decode data from the downloaded URL. Fixes (issue(s)/protocol(s) I was able to reproduce): * https://github.com/StevenBlack/hosts/issues/514#issuecomment-368932152 Possible fix of (issue(s)/protocol(s) I wasn't able to reproduce): * https://github.com/StevenBlack/hosts/issues/514#issue-300048106 * https://github.com/StevenBlack/hosts/issues/494#issue-296166492 * https://github.com/StevenBlack/hosts/issues/420#issue-267453114 * https://github.com/StevenBlack/hosts/issues/372#issue-246927047 * https://github.com/StevenBlack/hosts/issues/382#issuecomment-322010562 --- diff --git a/updateHostsFile.py b/updateHostsFile.py index ad77cc823..d27850ef8 100644 --- a/updateHostsFile.py +++ b/updateHostsFile.py @@ -6,23 +6,26 @@ # This Python script will combine all the host files you provide # as sources into one, unique host file to keep you internet browsing happy. 
-from __future__ import (absolute_import, division, - print_function, unicode_literals) -from glob import glob +from __future__ import (absolute_import, division, print_function, + unicode_literals) -import os +import argparse +import fnmatch +import json import locale +import os import platform import re import shutil +import socket import subprocess import sys import tempfile import time -import fnmatch -import argparse -import socket -import json +from glob import glob + +import lxml +from bs4 import BeautifulSoup # Detecting Python 3 for version-dependent implementations PY3 = sys.version_info >= (3, 0) @@ -1125,6 +1128,62 @@ def remove_old_hosts_file(backup): open(old_file_path, "a").close() # End File Logic +def domain_to_idna(line): + """ + Encode a domain which is presente into a line into `idna`. This way we avoid + the most encoding issue case. + + Parameters + ---------- + line : str + The line we have to encode/decode. + + Returns + ------- + line : str + The line in a converted format. + + Notes + ----- + - This method/function encode only the domain to `idna` format because in + most cases the encoding issue is due to a domain which looks like + `b'\xc9\xa2oogle.com'.decode('idna')`. + - About the splitting: + We split because we only want to encode the domain and not the full line + which may cause some issue. Keep in mind that we split but we still + concatenate once we encoded the domain. + + - The following split the prefix `0.0.0.0` or `127.0.0.1` of a line. + - The following also split the trailing comment of a given line. + - You do not get it ? + - Run https://git.io/vA1Rj and enjoy the view :-). 
+ """ + + if not line.startswith('#'): + for separator in [' ', '\t']: + comment_to_append = '' + + if separator in line: + splited_line = line.split(separator) + if '#' in splited_line[1]: + comment_to_append = splited_line[1].split('#')[1] + + if comment_to_append: + splited_line[1] = splited_line[1] \ + .split(comment_to_append)[0] \ + .encode("IDNA").decode("UTF-8") + \ + '#' + comment_to_append[1] + else: + splited_line[1] = splited_line[1] \ + .encode("IDNA") \ + .decode("UTF-8") + '#' + else: + splited_line[1] = splited_line[1] \ + .encode("IDNA") \ + .decode("UTF-8") + return separator.join(splited_line) + return line.encode("IDNA").decode("UTF-8") + return line.encode("UTF-8").decode("UTF-8") # Helper Functions def get_file_by_url(url): @@ -1141,11 +1200,17 @@ def get_file_by_url(url): url_data : str or None The data retrieved at that URL from the file. Returns None if the attempted retrieval is unsuccessful. + + Note + ---- + - BeautifulSoup is used in this case to avoid having to search in which + format we have to encode or decode data before parsing it to UTF-8. """ try: f = urlopen(url) - return f.read().decode("UTF-8") + soup = BeautifulSoup(f.read(),'lxml').get_text() + return '\n'.join(list(map(domain_to_idna, soup.split('\n')))) except Exception: print("Problem getting file: ", url) @@ -1165,7 +1230,10 @@ def write_data(f, data): if PY3: f.write(bytes(data, "UTF-8")) else: - f.write(str(data).encode("UTF-8")) + try: + f.write(str(data)) + except UnicodeEncodeError: + f.write(str(data.encode("UTF-8"))) def list_dir_no_hidden(path):