Shannon entropy

author Georgios Kontaxis <redacted>

Sat, 21 Mar 2015 20:32:30 +0000 (16:32 -0400)

committer Georgios Kontaxis <redacted>

Sat, 21 Mar 2015 20:32:30 +0000 (16:32 -0400)
author Georgios Kontaxis <redacted>
Sat, 21 Mar 2015 20:32:30 +0000 (16:32 -0400)
committer Georgios Kontaxis <redacted>
Sat, 21 Mar 2015 20:32:30 +0000 (16:32 -0400)
diff --git a/entropy.py b/entropy.py

new file mode 100755 (executable)

index 0000000..d6cac94
--- /dev/null
+++ b/entropy.py
@@ -0,0 +1,152 @@
+#!/usr/bin/python
+
+from __future__ import print_function
+
+import argparse
+import os
+import sys
+
+import sqlite3
+
+import math
+
+def entropy(input, verbose):
+       """Print a per-offset entropy score for every entry in ``input``.
+
+       The entries are treated as one corpus. For each character offset, the
+       relative frequency of every symbol seen at that offset is computed;
+       entries shorter than the longest one count as padded with a
+       placeholder symbol, so entry length influences the probabilities.
+       Each entry is then scored by summing ``-p * log2(p)`` over its own
+       symbols, where ``p`` is the corpus-wide probability of that symbol at
+       that offset.
+
+       One line is written to stdout per entry, in input order, holding the
+       entry and its score separated by a tab. Leading and trailing
+       whitespace is stripped from every entry before it is counted or
+       printed. An empty entry has no symbols and therefore scores 0.
+
+       Example:
+               For the entries "a" and "b" each symbol has probability 0.5 at
+               offset 0, so both entries score -(0.5 * log2(0.5)) = 0.5.
+
+       Args:
+               input: iterable of entries, one token per item. It is consumed
+                       exactly once, so one-shot iterators (generators, Python 3
+                       ``map`` objects, open files) are accepted.
+               verbose: when true, trace every entry read and every step of the
+                       calculation on stderr.
+
+       Returns:
+               None. Results are printed, not returned.
+
+       Raises:
+               AssertionError: if the counts or probabilities computed for an
+                       offset are inconsistent (internal sanity checks).
+       """
+       # Materialize the input: it is walked twice below (counting, then
+       # scoring), and a one-shot iterator would already be exhausted on the
+       # second pass, silently producing no output at all.
+       lines = [line.strip() for line in input]
+       line_count = len(lines)
+
+       # frequency[index][symbol] is the number of entries that have
+       # `symbol` at offset `index`.
+       frequency = {}
+       for line in lines:
+               if verbose:
+                       print("< '%s'" % line, file=sys.stderr)
+               for index, symbol in enumerate(line):
+                       counts = frequency.setdefault(index, {})
+                       counts[symbol] = counts.get(symbol, 0) + 1
+
+       # Some entries might be shorter than others. Shorter entries are
+       # treated as if padded to the length of the longest one with the
+       # pseudo-symbol None, so that the counts at every offset add up to
+       # the number of entries processed. In terms of the calculation this
+       # factors in the length of the entries.
+       for index, counts in frequency.items():
+               missing = line_count - sum(counts.values())
+               assert missing >= 0,\
+                       "frequency_sum[%d] %d < 0" % (index, missing)
+               if missing > 0:
+                       counts[None] = missing
+
+       # probability[index][symbol] is the share of entries that have
+       # `symbol` at offset `index`.
+       probability = {}
+       for index, counts in frequency.items():
+               # After padding, the counts at every offset sum to line_count.
+               # float() avoids integer division on Python 2.
+               probability[index] = {
+                       symbol: float(count) / line_count
+                       for symbol, count in counts.items()}
+
+               # Consistency check: individual probabilities at each offset
+               # must sum up to 1, give or take floating-point rounding.
+               probability_sum = sum(probability[index].values())
+               assert abs(probability_sum - 1.0) < 0.001,\
+                       "probability_sum[%d] (%f) != 1" % (index, probability_sum)
+
+       # Score every entry against the corpus-wide probabilities, in input
+       # order.
+       for line in lines:
+               # Running score of this entry. Deliberately not called
+               # `entropy`, which would shadow this function inside itself.
+               bits = 0.0
+               for index, symbol in enumerate(line):
+                       p = probability[index][symbol]
+                       # p <= 1, so log2(p) <= 0 and every term adds to the score.
+                       bits -= p * math.log(p, 2)
+
+                       if verbose:
+                               print("> '%s' [%d] P(%s)=%f F(%s)=%d/%d E[0:%d]=%f" %
+                                       (line, index, symbol, p,
+                                       symbol, frequency[index][symbol], line_count,
+                                       index + 1, bits), file=sys.stderr)
+
+               print("%s\t%f" % (line, bits))
+
+
+# Input file is expected to be either:
+# a) a set of entries on individual lines
+# b) a cookies.sqlite database maintained by Firefox (moz_cookies table)
+#
+# Output is an equivalent number of lines with tab-separated values
+# where the left value is the original entry and the right value is
+# the calculated entropy in bits.
+
+def main(argv):
+       """Parse the command line and print the entropy of every input entry.
+
+       Args:
+               argv: full argument vector, program name first (e.g. sys.argv).
+       """
+       parser = argparse.ArgumentParser(description="Calculate entropy given "
+               "a set of values or a cookies.sqlite Firefox database.")
+       parser.add_argument("--verbose", "-v", action="store_true",
+               help="output details on how entropy is calculated")
+       parser.add_argument("input", nargs=1,
+               help="name of the input file (e.g., cookie_values.txt or cookies.sqlite)")
+       args = parser.parse_args(argv[1:])  # honour argv, not implicitly sys.argv
+       path = args.input[0]
+       if os.path.splitext(path)[1] == ".sqlite":
+               conn = sqlite3.connect(path)
+               try:
+                       rows = conn.execute("SELECT value FROM moz_cookies").fetchall()
+               finally:
+                       conn.close()  # released even when the query fails
+               # Python 2 gets UTF-8 byte strings as before; Python 3 keeps the text, since
+               # bytes would print as b'...'. A list, so it can be iterated more than once.
+               py2 = sys.version_info[0] < 3
+               values = [row[0].encode("utf-8") if py2 else row[0] for row in rows]
+       else:
+               if args.verbose:
+                       print("# NOTICE: Treating '%s' as a text file" % path, file=sys.stderr)
+               with open(path, "r") as handle:
+                       values = handle.readlines()
+       entropy(values, args.verbose)
+
+
+if __name__ == "__main__":  # run only when executed as a script, not on import
+       main(sys.argv)  # pass the full argument vector, program name included
author	Georgios Kontaxis <redacted>
	Sat, 21 Mar 2015 20:32:30 +0000 (16:32 -0400)
committer	Georgios Kontaxis <redacted>
	Sat, 21 Mar 2015 20:32:30 +0000 (16:32 -0400)