From: funilrys <redacted>
Date: Wed, 28 Feb 2018 22:06:58 +0000 (+0100)
Subject: Review of get_file_by_url()
X-Git-Url: http://git.99rst.org/?a=commitdiff_plain;h=c82f69195282bda1e5c2a6d756bebf42ce92ac9d;p=stevenblack-hosts.git

Review of get_file_by_url()

Please note that this patch also introduce
which is in charge of converting a domain in a line into
IDNA and/or UTF-8 format.

Also note the introduction of BeautifulSoup() which helps
us to decode data from the downloaded URL.

Fixes (issue(s)/protocol(s) I was able to reproduce):
 * https://github.com/StevenBlack/hosts/issues/514#issuecomment-368932152

Possible fix of (issue(s)/protocol(s) I wasn't able to reproduce):
 * https://github.com/StevenBlack/hosts/issues/514#issue-300048106
 * https://github.com/StevenBlack/hosts/issues/494#issue-296166492
 * https://github.com/StevenBlack/hosts/issues/420#issue-267453114
 * https://github.com/StevenBlack/hosts/issues/372#issue-246927047
 * https://github.com/StevenBlack/hosts/issues/382#issuecomment-322010562
---

diff --git a/updateHostsFile.py b/updateHostsFile.py
index ad77cc823..d27850ef8 100644
--- a/updateHostsFile.py
+++ b/updateHostsFile.py
@@ -6,23 +6,26 @@
 # This Python script will combine all the host files you provide
 # as sources into one, unique host file to keep you internet browsing happy.
 
-from __future__ import (absolute_import, division,
-                        print_function, unicode_literals)
-from glob import glob
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
 
-import os
+import argparse
+import fnmatch
+import json
 import locale
+import os
 import platform
 import re
 import shutil
+import socket
 import subprocess
 import sys
 import tempfile
 import time
-import fnmatch
-import argparse
-import socket
-import json
+from glob import glob
+
+import lxml
+from bs4 import BeautifulSoup
 
 # Detecting Python 3 for version-dependent implementations
 PY3 = sys.version_info >= (3, 0)
@@ -1125,6 +1128,62 @@ def remove_old_hosts_file(backup):
     open(old_file_path, "a").close()
 # End File Logic
 
+def domain_to_idna(line):
+    """
+    Encode a domain which is presente into a line into `idna`. This way we avoid
+    the most encoding issue case.
+
+    Parameters
+    ----------
+    line : str
+        The line we have to encode/decode.
+
+    Returns
+    -------
+    line : str
+        The line in a converted format.
+
+    Notes
+    -----
+    - This method/function encode only the domain to `idna` format because in
+        most cases the encoding issue is due to a domain which looks like
+        `b'\xc9\xa2oogle.com'.decode('idna')`.
+    - About the splitting:
+        We split because we only want to encode the domain and not the full line
+            which may cause some issue. Keep in mind that we split but we still
+            concatenate once we encoded the domain.
+
+        - The following split the prefix `0.0.0.0` or `127.0.0.1` of a line.
+        - The following also split the trailing comment of a given line.
+    - You do not get it ?
+        - Run https://git.io/vA1Rj and enjoy the view :-).
+    """
+
+    if not line.startswith('#'):
+        for separator in [' ', '\t']:
+            comment_to_append = ''
+
+            if separator in line:
+                splited_line = line.split(separator)
+                if '#' in splited_line[1]:
+                    comment_to_append = splited_line[1].split('#')[1]
+
+                    if comment_to_append:
+                        splited_line[1] = splited_line[1] \
+                            .split(comment_to_append)[0] \
+                            .encode("IDNA").decode("UTF-8") + \
+                                '#' + comment_to_append[1]
+                    else:
+                        splited_line[1] = splited_line[1] \
+                            .encode("IDNA") \
+                            .decode("UTF-8") + '#'
+                else:
+                    splited_line[1] = splited_line[1] \
+                        .encode("IDNA") \
+                        .decode("UTF-8")
+                return separator.join(splited_line)
+        return line.encode("IDNA").decode("UTF-8")
+    return line.encode("UTF-8").decode("UTF-8")
 
 # Helper Functions
 def get_file_by_url(url):
@@ -1141,11 +1200,17 @@ def get_file_by_url(url):
     url_data : str or None
         The data retrieved at that URL from the file. Returns None if the
         attempted retrieval is unsuccessful.
+
+    Note
+    ----
+    - BeautifulSoup is used in this case to avoid having to search in which
+        format we have to encode or decode data before parsing it to UTF-8.
     """
 
     try:
         f = urlopen(url)
-        return f.read().decode("UTF-8")
+        soup = BeautifulSoup(f.read(),'lxml').get_text()
+        return '\n'.join(list(map(domain_to_idna, soup.split('\n'))))
     except Exception:
         print("Problem getting file: ", url)
 
@@ -1165,7 +1230,10 @@ def write_data(f, data):
     if PY3:
         f.write(bytes(data, "UTF-8"))
     else:
-        f.write(str(data).encode("UTF-8"))
+        try:
+            f.write(str(data))
+        except UnicodeEncodeError:
+            f.write(str(data.encode("UTF-8")))
 
 
 def list_dir_no_hidden(path):