--- /dev/null
+#!/usr/bin/python
+
+from __future__ import print_function
+
+import argparse
+import os
+import sys
+
+import sqlite3
+
+import math
+
+def entropy(input, verbose):
+ # Dictionary of dictionaries used to store the frequency of each
+ # symbol at each offset from the beginning of an input token.
+ frequency = {}
+
+ line_count = 0
+
+ # Count the frequency of each character at each offset within a token.
+ for line in input:
+ line = line.strip()
+ verbose and print("< '%s'" % line, file=sys.stderr)
+ line_count += 1
+
+ for index in range(0, len(line)):
+ symbol = line[index]
+ if (not index in frequency):
+ frequency[index] = {}
+ if (not symbol in frequency[index]):
+ frequency[index][symbol] = 1
+ else:
+ frequency[index][symbol] += 1
+
+ # Some lines might be shorter than others. We assume that shorter lines
+ # are padded to the length of the longest line using symbol None. This
+ # makes sure that the sum of the symbol frequencies at each index equals
+ # the number of entries we have processed. In terms of entropy calculation
+ # this factors in the length of the entries.
+ for index in sorted(frequency.keys(), reverse=True):
+ frequency_sum = line_count;
+
+ for char in frequency[index].keys():
+ frequency_sum -= frequency[index][char]
+ assert (frequency_sum >= 0),\
+ "frequency_sum[%d] %d < 0" % (index, frequency_sum)
+
+ if (frequency_sum == 0):
+ break
+
+ frequency[index][None] = frequency_sum
+
+ # Dictionary used to store the sum of symbol frequencies
+ # at each offset from the beginning of a token.
+ # Should be equal to number of tokens (see above).
+ frequency_sums = {}
+
+ # Dictionary of dictionaries used to store the probability
+ # of each symbol at each offset from the beginning of a token.
+ probability = {}
+
+ for index in frequency.keys():
+ # Assertion: sum up the frequencies of all symbols at each index.
+ # They must be equal to the number of entries processed, i.e., every
+ # entry must have at least one symbol (even None, see above) at every
+ # index.
+ if (not index in frequency_sums):
+ frequency_sums[index] = 0
+ for symbol in frequency[index].keys():
+ frequency_sums[index] += frequency[index][symbol]
+ assert (frequency_sums[index] == line_count)
+ #
+ for symbol in frequency[index].keys():
+ if (not index in probability):
+ probability[index] = {}
+ probability[index][symbol] =\
+ float(frequency[index][symbol]) / frequency_sums[index]
+
+ # Assertion: individual probabilities at each index must sum up to 1
+ for index in frequency.keys():
+ probability_sum = 0
+ for symbol in frequency[index].keys():
+ probability_sum += probability[index][symbol]
+ # 0.999 and above get rounded up to 1.0
+ assert (int(probability_sum+0.001) == 1),\
+ "probability_sum[%d] (%.100f) %d != 1" %\
+ (index, probability_sum, int(probability_sum+0.001))
+
+ # Shannon Entropy
+ for line in input:
+ line = line.strip()
+
+ # Shannon Entropy
+ entropy = 0
+ for index in range(0, len(line)):
+ symbol = line[index];
+
+ entropy += probability[index][symbol] *\
+ math.log(probability[index][symbol], 2);
+
+ verbose and print("> '%s' [%d] P(%s)=%f F(%s)=%d/%d E[0:%d]=%f" %
+ (line, index, symbol, probability[index][symbol],
+ symbol, frequency[index][symbol], frequency_sums[index],
+ index + 1, 0 - entropy), file=sys.stderr)
+
+ entropy = 0 - entropy
+
+ print("%s\t%f" % (line, entropy))
+
+
+# Input file is expected to be either:
+# a) a set of entries on individual lines
+# b) a cookies.sqlite database maintained by Firefox (moz_cookies table)
+#
+# Output is an equivalent number of lines with tab-separated values
+# where the left value is the original entry and the right value is
+# the calculated entropy in bits.
+
+def main(argv):
+
+ parser = argparse.ArgumentParser(description=
+ "Calculate entropy given a set of values" +
+ "or a cookies.sqlite Firefox database.")
+
+ parser.add_argument("--verbose", "-v",
+ action="store_const", const=True, default=False,
+ help="output details on how entropy is calculated")
+
+ parser.add_argument("input", nargs=1,
+ help="name of the input file (e.g., cookie_values.txt or cookies.sqlite");
+
+ args = parser.parse_args()
+
+ root, ext = os.path.splitext(args.input[0])
+ if (ext == ".sqlite"):
+ conn = sqlite3.connect(args.input[0]) or die()
+ c = conn.cursor()
+ c.execute("SELECT value FROM moz_cookies")
+ entropy(
+ map(lambda x: x[0].encode('utf-8', errors='strict') ,c.fetchall()),
+ args.verbose)
+ conn.close()
+ else:
+ args.verbose and print("# NOTICE: Treating '%s' as a text file" %
+ (args.input[0]), file=sys.stderr)
+ file = open(args.input[0], "r") or die()
+ entropy(file.readlines(), args.verbose)
+ file.close()
+
+
+if __name__ == "__main__":
+ main(sys.argv)