From: Georgios Kontaxis
Date: Tue, 10 Nov 2015 18:42:11 +0000 (-0500)
Subject: eTLDplusOne
X-Git-Url: http://git.99rst.org/?a=commitdiff_plain;h=86349ed5ca33d8cbb9d8b43b751cc20ac66d1ba2;p=eTLDplusOne.git

eTLDplusOne
---

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..c4f6061
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,12 @@
+.PHONY: all clean
+
+all: db.sqlite3
+
+public_suffix_list.dat:
+	bash get_list.sh
+
+db.sqlite3: public_suffix_list.dat
+	python makedb.py
+
+clean:
+	rm -f public_suffix_list.dat db.sqlite3
diff --git a/README.md b/README.md
index 6697878..c909e2b 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,15 @@
 # eTLDplusOne
+
+```
+usage: eTLDplusOne.py [-h] [--verbose] D [D ...]
+
+Given a domain return its suffix comprised of the subdomain following its
+effective top-level domain and the effective top-level domain itself.
+
+positional arguments:
+  D              Domain to look up.
+
+optional arguments:
+  -h, --help     show this help message and exit
+  --verbose, -v  Output information on the process.
+```
diff --git a/eTLDplusOne.py b/eTLDplusOne.py
new file mode 100755
index 0000000..14925d6
--- /dev/null
+++ b/eTLDplusOne.py
@@ -0,0 +1,155 @@
+#!/usr/bin/python -u
+
+# kontaxis 2015-11-03
+
+# Mozilla maintains a public list of DNS suffixes which are not under the
+# control of individual registrants. The registered or registrable domain
+# is the public suffix plus one additional label (eTLD+1).
+#
+# This program will take a hostname as input and return its eTLD+1
+# or the hostname itself.
+# e.g., ./eTLDplusOne.py foo.example.com will return example.com
+# e.g., ./eTLDplusOne.py example.com will return example.com
+# e.g., ./eTLDplusOne.py example.invalid will return example.invalid
+#
+# Figuring out the eTLD+1 is useful when deciding the scope of a domain or
+# grouping a set of domains under a common site.
+
+# References:
+# - https://publicsuffix.org/list/
+
+from __future__ import print_function
+
+import argparse
+import os
+import sys
+import sqlite3
+
+class eTLDplusOne:
+    """Compute the eTLD+1 of domain names from a SQLite suffix database.
+
+    The database must provide a table "eTLDs" with a text column "eTLD"
+    holding one suffix rule per row.
+    """
+
+    # When true, calculate() reports every lookup it performs on stdout.
+    verbose = False
+
+    # Cursor on the suffix database; assigned by __init__.
+    _dbConnCursor = None
+
+    def __init__(self, dbPath):
+        """Open the SQLite database at dbPath and keep a cursor on it."""
+        # Use the dbPath parameter rather than a module-level name so the
+        # class also works when it is imported instead of run as a script.
+        conn = sqlite3.connect(dbPath)
+        conn.text_factory = str
+        self._dbConnCursor = conn.cursor()
+
+    def calculate(self, domains):
+        """Return a list with the eTLD+1 of each entry of domains, in order.
+
+        A domain for which no suffix is found in the database is returned
+        unchanged.
+        """
+        eTLDplusOneDomains = []
+
+        for domain in domains:
+            # A domain or rule can be split into a list of labels using the
+            # separator "." (dot). The separator is not part of any of the
+            # labels. Empty labels are not permitted, meaning that leading
+            # and trailing dots are ignored.
+            labels = domain.strip(".").split(".")
+
+            # If no eTLD is found in the database,
+            # make the current domain the eTLD+1.
+            eTLDplusOneDomain = domain
+
+            # If a domain matches more than one rule in the file, the longest
+            # matching rule (the one with the most levels) will be used.
+            # Candidate suffixes are tried from longest to shortest and the
+            # first hit ends the search.
+            for i in range(1, len(labels)):
+                # A domain is said to match a rule if and only if all of the
+                # following conditions are met:
+                # When the domain and rule are split into corresponding
+                # labels, that the domain contains as many or more labels
+                # than the rule.
+                # Beginning with the right-most labels of both the domain
+                # and the rule, and continuing for all labels in the rule,
+                # one finds that for every pair, either they are identical,
+                # or that the label from the rule is "*".
+                # XXX We only check for wildcards in the left-most label.
+                eTLD = ".".join(labels[i:])
+                eTLD_wild = ".".join(["*"] + labels[i+1:])
+                if eTLD_wild == "*":
+                    eTLD_wild = eTLD
+
+                if self.verbose:
+                    print("eTLD '%s' OR eTLD '%s' : " % (
+                        eTLD, eTLD_wild), end="")
+
+                self._dbConnCursor.execute(
+                    'SELECT eTLD FROM eTLDs WHERE eTLD=? OR eTLD=?',
+                    (eTLD, eTLD_wild))
+                match = self._dbConnCursor.fetchone()
+                if not match:
+                    if self.verbose:
+                        print("eTLD+1 NONE")
+                    continue
+
+                # eTLD has been found.
+                eTLDplusOneDomain = ".".join(labels[i-1:])
+
+                if self.verbose:
+                    print("eTLD+1 '%s'" % eTLDplusOneDomain)
+
+                break
+
+            eTLDplusOneDomains.append(eTLDplusOneDomain)
+
+        return eTLDplusOneDomains
+
+
+if __name__ == "__main__":
+
+    # Parse arguments.
+    parser = argparse.ArgumentParser(description=
+        "Given a domain return its suffix comprised of " +
+        "the subdomain following its effective top-level domain " +
+        "and the effective top-level domain itself.")
+
+    parser.add_argument("--verbose", "-v",
+        action="store_const", const=True, default=False,
+        help = "Output information on the process.")
+
+    parser.add_argument("domains", metavar="D", nargs="+",
+        help="Domain to look up.")
+
+    args = parser.parse_args()
+
+    # Make sure the SQLite3 database file exists in the same directory.
+    dirname = os.path.dirname(sys.argv[0])
+    dbpath = os.path.join(dirname, "db.sqlite3")
+
+    if not os.path.exists(dbpath):
+        print("ERROR. Path '%s' is unavailable." % dbpath, file=sys.stderr)
+        sys.exit(-1)
+
+    if not os.path.isfile(dbpath):
+        print("ERROR. Path '%s' is not a file." % dbpath, file=sys.stderr)
+        sys.exit(-1)
+
+    eTLDpOne = eTLDplusOne(dbpath)
+    eTLDpOne.verbose = args.verbose
+
+    eTLDpOneDomains = eTLDpOne.calculate(args.domains)
+    for eTLDpOneDomain in eTLDpOneDomains:
+        print("%s" % eTLDpOneDomain)
+
+    # Success
+    if eTLDpOneDomains:
+        sys.exit(0)
+
+    # Failure
+    sys.exit(1)
diff --git a/get_list.sh b/get_list.sh
new file mode 100644
index 0000000..2a6134c
--- /dev/null
+++ b/get_list.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# https://publicsuffix.org/
+URL="https://publicsuffix.org/list/public_suffix_list.dat"
+
+curl -L "${URL}" -o "public_suffix_list.dat";
diff --git a/makedb.py b/makedb.py
new file mode 100644
index 0000000..067cc5b
--- /dev/null
+++ b/makedb.py
@@ -0,0 +1,71 @@
+#!/usr/bin/python -u
+
+# kontaxis 2015-11-03
+
+# References:
+# - https://publicsuffix.org/list/
+
+from __future__ import print_function
+
+import os
+import sqlite3
+import sys
+import time
+
+dirname = os.path.dirname(sys.argv[0])
+
+# Populate eTLDs records array
+eTLDs = []
+
+# open() is available on both Python 2 and Python 3 (the file() builtin
+# exists on Python 2 only); the with block closes the list even when
+# parsing fails part-way through.
+with open(os.path.join(dirname, "public_suffix_list.dat"), "r") as f:
+    # The list is a set of rules, with one rule per line.
+    for line in f:
+        # The Public Suffix List consists of a series of lines,
+        # separated by \n.
+        line = line.rstrip("\n")
+        # Each line is only read up to the first whitespace;
+        line = line.split(" ")[0]
+        if line == "":
+            continue
+        # entire lines can also be commented using //.
+        if line.startswith("//"):
+            continue
+        # Each line which is not entirely whitespace or
+        # begins with a comment contains a rule.
+        rule = line
+        # A rule may begin with a "!" (exclamation mark). If it does, it is
+        # labelled as an "exception rule" and then treated as if the
+        # exclamation mark is not present.
+        # NOTE(review): the stored rule keeps no trace of having been an
+        # exception, so it ends up in the table like any other rule --
+        # confirm this is intended.
+        if rule[0] == "!":
+            rule = rule[1:]
+        eTLDs.append((rule,))
+
+# Make it happen
+conn = sqlite3.connect("db.sqlite3")
+conn.text_factory = str
+c = conn.cursor()
+
+# Create schema.
+c.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?",
+    ("last_generated",))
+match = c.fetchone()
+if not match:
+    c.execute("CREATE TABLE last_generated (epoch integer);")
+    c.execute("CREATE TABLE eTLDs (eTLD text);")
+    c.execute("CREATE INDEX eTLD on eTLDs (eTLD);")
+
+c.execute('DELETE FROM last_generated')
+c.execute('INSERT INTO last_generated VALUES(?)',
+    (str(int(time.time())),))
+
+c.execute('DELETE FROM eTLDs')
+c.executemany('INSERT INTO eTLDs VALUES (?)', eTLDs)
+
+conn.commit()
+conn.close()