From: Georgios Kontaxis <redacted>
Date: Tue, 10 Nov 2015 18:42:11 +0000 (-0500)
Subject: eTLDplusOne
X-Git-Url: http://git.99rst.org/?a=commitdiff_plain;h=86349ed5ca33d8cbb9d8b43b751cc20ac66d1ba2;p=eTLDplusOne.git

eTLDplusOne
---

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..c4f6061
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,12 @@
+.PHONY: all clean
+# Default target: build the SQLite3 database of public suffixes.
+all: db.sqlite3
+# Fetch the Public Suffix List by running get_list.sh.
+public_suffix_list.dat:
+	bash get_list.sh
+# Rebuild the database whenever the downloaded list is newer than it.
+db.sqlite3: public_suffix_list.dat
+	python makedb.py
+# Remove the downloaded list and the generated database.
+clean:
+	rm -f public_suffix_list.dat db.sqlite3
diff --git a/README.md b/README.md
index 6697878..c909e2b 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,15 @@
 # eTLDplusOne
+
+```
+usage: eTLDplusOne.py [-h] [--verbose] D [D ...]
+
+Given a domain return its suffix comprised of the subdomain following its
+effective top-level domain and the effective top-level domain itself.
+
+positional arguments:
+  D              Domain to look up.
+
+optional arguments:
+  -h, --help     show this help message and exit
+  --verbose, -v  Output information on the process.
+```
diff --git a/eTLDplusOne.py b/eTLDplusOne.py
new file mode 100755
index 0000000..14925d6
--- /dev/null
+++ b/eTLDplusOne.py
@@ -0,0 +1,133 @@
+#!/usr/bin/python -u
+
+# kontaxis 2015-11-03
+
+# Mozilla maintains a public list of DNS suffixes which are not under the
+# control of individual registrants. The registered or registrable domain
+# is the public suffix plus one additional label (eTLD+1).
+#
+# This program will take a hostname as input and return its eTLD+1
+# or the hostname itself.
+# e.g., ./eTLDplusOne.py foo.example.com will return example.com
+# e.g., ./eTLDplusOne.py example.com     will return example.com
+# e.g., ./eTLDplusOne.py example.invalid will return example.invalid
+#
+# Figuring out the eTLD+1 is useful when deciding the scope of a domain or
+# grouping a set of domains under a common site.
+
+# References:
+# - https://publicsuffix.org/list/
+
+from __future__ import print_function
+
+import argparse
+import os
+import sys
+import sqlite3
+
+class eTLDplusOne:
+	"""Map domain names to their eTLD+1 (public suffix plus one label).
+
+	Public suffixes are looked up in the "eTLDs" table of a SQLite3 database.
+	Set the "verbose" attribute to True to print each lookup to stdout.
+	"""
+	verbose = False
+
+	_dbConnCursor = None
+
+	def __init__(self, dbPath):
+		"""Open the SQLite3 database at dbPath and keep a cursor for lookups."""
+		# Use the argument, not a module-level global, so imports work too.
+		conn = sqlite3.connect(dbPath)
+		conn.text_factory = str
+		self._dbConnCursor = conn.cursor()
+
+	def calculate(self, domains):
+		"""Return a list holding the eTLD+1 of each domain, in input order.
+
+		For every domain, suffixes are tried from longest to shortest so that
+		the longest matching rule wins. A suffix matches when the database
+		holds it verbatim or with its left-most label replaced by "*". A
+		domain for which no suffix matches is returned unchanged.
+		"""
+		eTLDplusOneDomains = []
+
+		for domain in domains:
+			# Leading and trailing dots do not form labels, so drop them.
+			labels = domain.strip(".").split(".")
+
+			# If no eTLD is found, the domain itself is reported as the eTLD+1.
+			eTLDplusOneDomain = domain
+
+			for i in range(1, len(labels)):
+				# XXX We only check for wildcards in the left-most label.
+				eTLD      = ".".join(        labels[i:])
+				eTLD_wild = ".".join(["*"] + labels[i+1:])
+				# A single-label suffix has no wildcard form; reuse the plain one.
+				if eTLD_wild == "*":
+					eTLD_wild = eTLD
+
+				if self.verbose:
+					print("eTLD '%s' OR eTLD '%s' : " % (eTLD, eTLD_wild), end="")
+
+				self._dbConnCursor.execute(
+					'SELECT eTLD FROM eTLDs WHERE eTLD=? OR eTLD=?', (eTLD, eTLD_wild))
+				if not self._dbConnCursor.fetchone():
+					if self.verbose:
+						print("eTLD+1 NONE")
+					continue
+
+				# eTLD has been found; the eTLD+1 adds the label to its left.
+				eTLDplusOneDomain = ".".join(labels[i-1:])
+
+				if self.verbose:
+					print("eTLD+1 '%s'" % eTLDplusOneDomain)
+				break
+
+			eTLDplusOneDomains.append(eTLDplusOneDomain)
+
+		return eTLDplusOneDomains
+
+
+if __name__ == "__main__":
+
+	# Build the command-line interface.
+	description = (
+		"Given a domain return its suffix comprised of "
+		"the subdomain following its effective top-level domain "
+		"and the effective top-level domain itself.")
+	parser = argparse.ArgumentParser(description=description)
+
+	parser.add_argument("--verbose", "-v", action="store_true",
+		help="Output information on the process.")
+
+	parser.add_argument("domains", metavar="D", nargs="+",
+		help="Domain to look up.")
+
+	args = parser.parse_args()
+
+	# The SQLite3 database must sit in the same directory as this script.
+	dirname = os.path.dirname(sys.argv[0])
+	dbpath  = os.path.join(dirname, "db.sqlite3")
+
+	# Report the first problem found with the database path, if any.
+	problem = None
+	if not os.path.exists(dbpath):
+		problem = "ERROR. Path '%s' is unavailable."
+	elif not os.path.isfile(dbpath):
+		problem = "ERROR. Path '%s' is not a file."
+
+	if problem is not None:
+		print(problem % dbpath, file=sys.stderr)
+		sys.exit(-1)
+
+	# Resolve every domain given on the command line, one result per line.
+	lookup = eTLDplusOne(dbpath)
+	lookup.verbose = args.verbose
+
+	results = lookup.calculate(args.domains)
+	for result in results:
+		print("%s" % result)
+
+	# Exit status: 0 when at least one result was produced, 1 otherwise.
+	sys.exit(0 if results else 1)
diff --git a/get_list.sh b/get_list.sh
new file mode 100644
index 0000000..2a6134c
--- /dev/null
+++ b/get_list.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+# Download the Public Suffix List into the current directory.
+# Source: https://publicsuffix.org/
+URL="https://publicsuffix.org/list/public_suffix_list.dat"
+# -f: exit non-zero on HTTP errors instead of saving the error page; -L: follow redirects.
+curl -fL "${URL}" -o "public_suffix_list.dat"
diff --git a/makedb.py b/makedb.py
new file mode 100644
index 0000000..067cc5b
--- /dev/null
+++ b/makedb.py
@@ -0,0 +1,67 @@
+#!/usr/bin/python -u
+
+# kontaxis 2015-11-03
+
+# References:
+# - https://publicsuffix.org/list/
+
+from __future__ import print_function
+
+import os
+import sqlite3
+import sys
+import time
+
+# Build db.sqlite3 from public_suffix_list.dat; both live in the directory
+# that contains this script.
+dirname = os.path.dirname(sys.argv[0])
+
+# Populate eTLDs records array (one single-column row per rule).
+eTLDs = []
+
+# open() exists on Python 2 and 3 alike (file() is Python 2 only), and the
+# "with" block closes the list even when parsing raises.
+with open(os.path.join(dirname, "public_suffix_list.dat"), "r") as f:
+	# The list is a set of rules, with one rule per line.
+	for line in f:
+		# The Public Suffix List consists of a series of lines, separated by \n.
+		line = line.rstrip("\n")
+		# Each line is only read up to the first whitespace;
+		line = line.split(" ")[0]
+		if line == "":
+			continue
+		# entire lines can also be commented using //.
+		if line.startswith("//"):
+			continue
+		# Each line which is not entirely whitespace or
+		# begins with a comment contains a rule.
+		rule = line
+		# A rule may begin with a "!" (exclamation mark). If it does, it is
+		# labelled as an "exception rule" and then treated as if the exclamation
+		# mark is not present.
+		if rule[0] == "!":
+			rule = rule[1:]
+		eTLDs.append((rule,))
+
+# Make it happen. The database is written next to this script, which is
+# where eTLDplusOne.py looks for it, rather than in the working directory.
+conn = sqlite3.connect(os.path.join(dirname, "db.sqlite3"))
+conn.text_factory = str
+c = conn.cursor()
+
+# Create schema. IF NOT EXISTS keeps reruns against an existing file safe.
+c.execute("CREATE TABLE IF NOT EXISTS last_generated (epoch integer);")
+c.execute("CREATE TABLE IF NOT EXISTS eTLDs (eTLD text);")
+c.execute("CREATE INDEX IF NOT EXISTS eTLD on eTLDs (eTLD);")
+
+# Record when the database was generated (seconds since the epoch).
+c.execute('DELETE FROM last_generated')
+c.execute('INSERT INTO last_generated VALUES(?)',
+	(int(time.time()),))
+
+# Replace any previously stored rules with the freshly parsed ones.
+c.execute('DELETE FROM eTLDs')
+c.executemany('INSERT INTO eTLDs VALUES (?)', eTLDs)
+
+conn.commit()
+conn.close()