--- /dev/null
+#!/usr/bin/python -u
+
+# kontaxis 2015-11-03
+
+# Mozilla maintains a public list of DNS suffixes which are not under the
+# control of individual registrants. The registered or registrable domain
+# is the public suffix plus one additional label (eTLD+1).
+#
+# This program will take a hostname as input and return its eTLD+1
+# or the hostname itself.
+# e.g., ./eTLDplusOne.py foo.example.com will return example.com
+# e.g., ./eTLDplusOne.py example.com will return example.com
+# e.g., ./eTLDplusOne.py example.invalid will return example.invalid
+#
+# Figuring out the eTLD+1 is useful when deciding the scope of a domain or
+# grouping a set of domains under a common site.
+
+# References:
+# - https://publicsuffix.org/list/
+
+from __future__ import print_function
+
+import argparse
+import os
+import sys
+import sqlite3
+
class eTLDplusOne:
    """Compute the eTLD+1 (registrable domain) of hostnames.

    Public suffixes are looked up in the ``eTLDs`` table (column ``eTLD``)
    of an SQLite3 database.
    """

    # When true, calculate() prints every lookup it performs to stdout.
    verbose = False

    # Cursor on the database connection; set by __init__.
    _dbConnCursor = None

    def __init__(self, dbPath):
        """Open the SQLite3 database at dbPath and keep a cursor on it."""
        # Connect to the path that was passed in, not to a module-level name.
        conn = sqlite3.connect(dbPath)
        # Hand TEXT columns back as str.
        conn.text_factory = str
        self._dbConnCursor = conn.cursor()

    def calculate(self, domains):
        """Return the eTLD+1 of every hostname in domains.

        domains is an iterable of hostname strings. The result is a list
        with one entry per input, in the same order. Each entry is the
        longest matching public suffix plus one additional label, or the
        input hostname unchanged when no suffix in the database matches.
        """
        return [self._calculateOne(domain) for domain in domains]

    def _calculateOne(self, domain):
        """Return the eTLD+1 of a single hostname (see calculate())."""
        # A domain or rule can be split into a list of labels using the
        # separator "." (dot). The separator is not part of any of the labels.
        # Empty labels are not permitted, meaning that leading and trailing
        # dots are ignored.
        labels = domain.strip(".").split(".")
        count = len(labels)

        # If a domain matches more than one rule in the file, the longest
        # matching rule (the one with the most levels) will be used, so
        # candidate suffixes are tried from longest to shortest.
        for i in range(1, count):
            # A domain matches a rule when, comparing labels right to left
            # for every label in the rule, each pair is identical or the
            # rule's label is "*".
            # XXX We only check for wildcards in the left-most label.
            eTLD = ".".join(labels[i:])
            if i + 1 < count:
                eTLD_wild = ".".join(["*"] + labels[i + 1:])
            else:
                # A bare "*" is not a useful candidate; look up the plain
                # suffix only.
                eTLD_wild = eTLD

            if self.verbose:
                print("eTLD '%s' OR eTLD '%s' : " % (eTLD, eTLD_wild), end="")

            self._dbConnCursor.execute(
                'SELECT eTLD FROM eTLDs WHERE eTLD=? OR eTLD=?',
                (eTLD, eTLD_wild))
            if not self._dbConnCursor.fetchone():
                if self.verbose:
                    print("eTLD+1 NONE")
                continue

            # eTLD has been found: prepend the label just left of it.
            eTLDplusOneDomain = ".".join(labels[i - 1:])
            if self.verbose:
                print("eTLD+1 '%s'" % eTLDplusOneDomain)
            return eTLDplusOneDomain

        # If no eTLD is found in the database,
        # make the current domain the eTLD+1.
        return domain
+
+
if __name__ == "__main__":

    # Command line: an optional verbosity switch followed by one or more
    # domains to look up.
    parser = argparse.ArgumentParser(
        description="Given a domain return its suffix comprised of "
                    "the subdomain following its effective top-level domain "
                    "and the effective top-level domain itself.")
    parser.add_argument(
        "--verbose", "-v", action="store_true",
        help="Output information on the process.")
    parser.add_argument(
        "domains", metavar="D", nargs="+",
        help="Domain to look up.")
    args = parser.parse_args()

    # The SQLite3 database file must sit in the same directory as this
    # script and must be a regular file.
    dbpath = os.path.join(os.path.dirname(sys.argv[0]), "db.sqlite3")

    problem = None
    if not os.path.exists(dbpath):
        problem = "ERROR. Path '%s' is unavailable."
    elif not os.path.isfile(dbpath):
        problem = "ERROR. Path '%s' is not a file."
    if problem is not None:
        print(problem % dbpath, file=sys.stderr)
        sys.exit(-1)

    eTLDpOne = eTLDplusOne(dbpath)
    eTLDpOne.verbose = args.verbose

    # One result per line, in the order the domains were given.
    eTLDpOneDomains = eTLDpOne.calculate(args.domains)
    for eTLDpOneDomain in eTLDpOneDomains:
        print("%s" % eTLDpOneDomain)

    # Exit status 0 when at least one result was produced, 1 otherwise.
    sys.exit(0 if eTLDpOneDomains else 1)
--- /dev/null
+#!/usr/bin/python -u
+
+# kontaxis 2015-11-03
+
+# References:
+# - https://publicsuffix.org/list/
+
+from __future__ import print_function
+
+import os
+import sqlite3
+import sys
+import time
+
import io

# Directory holding this script. The suffix list is read from it and the
# database is written to it.
dirname = os.path.dirname(sys.argv[0])


def _parse_rule(line):
    """Return the rule on one Public Suffix List line, or None if none."""
    # Each line is only read up to the first whitespace; an empty line or
    # one that starts with whitespace therefore carries no rule.
    if not line or line[0].isspace():
        return None
    rule = line.split(None, 1)[0]
    # Entire lines can also be commented using //.
    if rule.startswith("//"):
        return None
    # A rule may begin with a "!" (exclamation mark), labelling it as an
    # "exception rule"; the mark is dropped before the rule is stored.
    # NOTE(review): the rule is then stored exactly like an ordinary rule,
    # so the database does not record that it was an exception.
    if rule.startswith("!"):
        rule = rule[1:]
    return rule or None


# Populate the eTLD records, one 1-tuple per rule (the shape executemany
# expects). The list is a set of rules, with one rule per line.
with io.open(os.path.join(dirname, "public_suffix_list.dat"), "r",
             encoding="utf-8") as f:
    eTLDs = [(rule,) for rule in map(_parse_rule, f) if rule is not None]

# Make it happen
conn = sqlite3.connect(os.path.join(dirname, "db.sqlite3"))
try:
    conn.text_factory = str
    c = conn.cursor()

    # Create schema (each statement is a no-op when the object exists).
    c.execute("CREATE TABLE IF NOT EXISTS last_generated (epoch integer)")
    c.execute("CREATE TABLE IF NOT EXISTS eTLDs (eTLD text)")
    c.execute("CREATE INDEX IF NOT EXISTS eTLD ON eTLDs (eTLD)")

    # Record when this database was generated.
    c.execute("DELETE FROM last_generated")
    c.execute("INSERT INTO last_generated VALUES(?)", (int(time.time()),))

    # Replace any previous rule set with the freshly parsed one.
    c.execute("DELETE FROM eTLDs")
    c.executemany("INSERT INTO eTLDs VALUES (?)", eTLDs)

    conn.commit()
finally:
    conn.close()