From: Georgios Kontaxis
Date: Sat, 21 Mar 2015 20:32:30 +0000 (-0400)
Subject: Shannon entropy
X-Git-Url: http://git.99rst.org/?a=commitdiff_plain;h=9f71a6ad6180d5c93573bc6b6c08daf8990832db;p=cookieanalysis.git

Shannon entropy
---

diff --git a/entropy.py b/entropy.py
new file mode 100755
index 0000000..d6cac94
--- /dev/null
+++ b/entropy.py
@@ -0,0 +1,152 @@
+#!/usr/bin/python
+
+from __future__ import print_function
+
+import argparse
+import os
+import sys
+
+import sqlite3
+
+import math
+
+def entropy(input, verbose):
+    # Dictionary of dictionaries used to store the frequency of each
+    # symbol at each offset from the beginning of an input token.
+    frequency = {}
+
+    line_count = 0
+
+    # Count the frequency of each character at each offset within a token.
+    for line in input:
+        line = line.strip()
+        verbose and print("< '%s'" % line, file=sys.stderr)
+        line_count += 1
+
+        for index in range(len(line)):
+            symbol = line[index]
+            if index not in frequency:
+                frequency[index] = {}
+            if symbol not in frequency[index]:
+                frequency[index][symbol] = 1
+            else:
+                frequency[index][symbol] += 1
+
+    # Some lines might be shorter than others. We assume that shorter lines
+    # are padded to the length of the longest line using the symbol None.
+    # This makes sure that the sum of the symbol frequencies at each index
+    # equals the number of entries we have processed. In terms of entropy
+    # calculation this factors in the length of the entries.
+    for index in sorted(frequency.keys(), reverse=True):
+        frequency_sum = line_count
+
+        for char in frequency[index].keys():
+            frequency_sum -= frequency[index][char]
+        assert frequency_sum >= 0, \
+            "frequency_sum[%d] %d < 0" % (index, frequency_sum)
+
+        if frequency_sum == 0:
+            break
+
+        frequency[index][None] = frequency_sum
+
+    # Dictionary used to store the sum of symbol frequencies
+    # at each offset from the beginning of a token.
+    # Should be equal to the number of tokens (see above).
+    frequency_sums = {}
+
+    # Dictionary of dictionaries used to store the probability
+    # of each symbol at each offset from the beginning of a token.
+    probability = {}
+
+    for index in frequency.keys():
+        # Assertion: sum up the frequencies of all symbols at each index.
+        # They must be equal to the number of entries processed, i.e., every
+        # entry must have at least one symbol (even None, see above) at every
+        # index.
+        if index not in frequency_sums:
+            frequency_sums[index] = 0
+        for symbol in frequency[index].keys():
+            frequency_sums[index] += frequency[index][symbol]
+        assert frequency_sums[index] == line_count
+
+        for symbol in frequency[index].keys():
+            if index not in probability:
+                probability[index] = {}
+            probability[index][symbol] = \
+                float(frequency[index][symbol]) / frequency_sums[index]
+
+    # Assertion: individual probabilities at each index must sum up to 1.
+    for index in frequency.keys():
+        probability_sum = 0
+        for symbol in frequency[index].keys():
+            probability_sum += probability[index][symbol]
+        # 0.999 and above gets rounded up to 1.0.
+        assert int(probability_sum + 0.001) == 1, \
+            "probability_sum[%d] (%.100f) %d != 1" % \
+            (index, probability_sum, int(probability_sum + 0.001))
+
+    # Shannon entropy: second pass over the tokens.
+    for line in input:
+        line = line.strip()
+
+        # Running sum of p * log2(p) over this token's symbols.
+        entropy = 0
+        for index in range(len(line)):
+            symbol = line[index]
+
+            entropy += probability[index][symbol] * \
+                math.log(probability[index][symbol], 2)
+
+            verbose and print("> '%s' [%d] P(%s)=%f F(%s)=%d/%d E[0:%d]=%f" %
+                (line, index, symbol, probability[index][symbol],
+                 symbol, frequency[index][symbol], frequency_sums[index],
+                 index + 1, 0 - entropy), file=sys.stderr)
+
+        entropy = 0 - entropy
+
+        print("%s\t%f" % (line, entropy))
+
+
+# Input file is expected to be either:
+# a) a set of entries on individual lines
+# b) a cookies.sqlite database maintained by Firefox (moz_cookies table)
+#
+# Output is an equivalent number of lines with tab-separated values
+# where the left value is the original entry and the right value is
+# the calculated entropy in bits.
+
+def main(argv):
+
+    parser = argparse.ArgumentParser(description=
+        "Calculate entropy given a set of values " +
+        "or a cookies.sqlite Firefox database.")
+
+    parser.add_argument("--verbose", "-v",
+        action="store_const", const=True, default=False,
+        help="output details on how entropy is calculated")
+
+    parser.add_argument("input", nargs=1,
+        help="name of the input file (e.g., cookie_values.txt or cookies.sqlite)")
+
+    args = parser.parse_args()
+
+    root, ext = os.path.splitext(args.input[0])
+    if ext == ".sqlite":
+        conn = sqlite3.connect(args.input[0])
+        c = conn.cursor()
+        c.execute("SELECT value FROM moz_cookies")
+        entropy(
+            [row[0].encode('utf-8', 'strict') for row in c.fetchall()],
+            args.verbose)
+        conn.close()
+    else:
+        args.verbose and print("# NOTICE: Treating '%s' as a text file" %
+            (args.input[0]), file=sys.stderr)
+        fp = open(args.input[0], "r")
+        entropy(fp.readlines(), args.verbose)
+        fp.close()
+
+
+if __name__ == "__main__":
+    main(sys.argv)
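
Usage: the new file is added with mode 100755, so once checked out it can be run directly. "./entropy.py cookie_values.txt" scores a plain list of tokens (one per line), and "./entropy.py -v cookies.sqlite" scores the value column of a Firefox cookie database; both filenames are the examples given in the script's help text. Each output line is the original token, a tab, and its score in bits.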
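
To make the per-offset calculation concrete, here is a minimal standalone sketch. The two-token list and the variable names are illustrative only and do not come from the repository; the sketch builds the positional symbol frequencies the same way entropy() does and, for each token, negates the sum of p * log2(p) over that token's own symbols, which is the value entropy.py prints.

#!/usr/bin/python

from __future__ import print_function

import math

# Illustrative tokens; real input would come from a text file or the
# moz_cookies table.
tokens = ["abc", "abd"]

# Frequency of each symbol at each offset, as in entropy().
frequency = {}
for token in tokens:
    for index, symbol in enumerate(token):
        frequency.setdefault(index, {}).setdefault(symbol, 0)
        frequency[index][symbol] += 1

# Per-token score: minus the sum of p * log2(p) over the token's own
# symbols, where p is the symbol's frequency at that offset divided by
# the number of tokens.
for token in tokens:
    score = 0.0
    for index, symbol in enumerate(token):
        p = float(frequency[index][symbol]) / len(tokens)
        score -= p * math.log(p, 2)
    print("%s\t%f" % (token, score))

With these two tokens, offsets 0 and 1 are constant across the input (p = 1, contributing zero bits) and offset 2 splits evenly (p = 0.5), so each token scores 0.5 bits, which is also what entropy.py prints for the same two-line input.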