From: Ben Limmer <redacted>
Date: Sun, 17 Feb 2013 20:51:49 +0000 (-0700)
Subject: Added ability to exclude domains.
X-Git-Url: http://git.99rst.org/?a=commitdiff_plain;h=997606b;p=stevenblack-hosts.git

Added ability to exclude domains.
This commit closes #1. I always hated that Hulu would freak out when I generated a new hosts file: Hulu checks that it can reach its ad servers before it will start streaming. I'm assuming other people have similar sites, so the script can now be extended with common and custom domains to exclude.
---

diff --git a/updateHostsFile.py b/updateHostsFile.py
index f0ac571e5..5b3a80db5 100644
--- a/updateHostsFile.py
+++ b/updateHostsFile.py
@@ -7,26 +7,38 @@
 # as sources into one, unique host file to keep you internet browsing happy.
 
 import os
+import re
 import string
 import sys
 import tempfile
 import urllib2
 
+# Project Settings
 BASEDIR_PATH = os.path.dirname(os.path.realpath(__file__))
 DATA_PATH = BASEDIR_PATH + '/data'
 DATA_FILENAMES = 'hosts'
 UPDATE_URL_FILENAME = 'update.info'
+SOURCES = os.listdir(DATA_PATH)
 
+# Exclusions
+EXCLUSION_PATTERN = '([a-zA-Z\d-]+\.){0,}' # optional subdomain labels; the domain to exclude is appended to the end
+
+# Common domains to exclude
+COMMON_EXCLUSIONS = ['hulu.com']
+
+# Global vars
+exclusionRegexs = [] # compiled exclusion patterns, filled in by excludeDomain()
 duplicatesRemoved = 0;
-sources = os.listdir(DATA_PATH)
 
 def main():
 	promptForUpdate()
+	promptForExclusions()
 	mergeFile = createInitialFile()
 	finalFile = removeDups(mergeFile)
 	finalizeFile(finalFile)
 	print 'Success! Your shiny new hosts file has been prepared.'
 
+# Prompt the User
 def promptForUpdate():
 	response = query_yes_no("Do you want to update all data sources?")
 	if (response == "yes"):
@@ -34,8 +46,54 @@ def promptForUpdate():
 	else:
 		print 'OK, we\'ll stick with what we\'ve  got locally.'
 
+def promptForExclusions(): # ask whether any domains should be excluded from the merged hosts rules
+	response = query_yes_no("Do you want to exclude any domains?\n" +
+							"For example, hulu.com video streaming must be able to access " +
+							"its tracking and ad servers in order to play video.")
+	if (response == "yes"):
+		displayExclusionOptions() # offers the common exclusions first, then custom ones
+	else:
+		print 'OK, we won\'t exclude any domains.'
+# End Prompt the User
+
+# Exclusion logic
+def displayExclusionOptions(): # offer each COMMON_EXCLUSIONS entry, then optionally custom domains
+	for exclusionOption in COMMON_EXCLUSIONS:
+		response = query_yes_no("Do you want to exclude the domain " + exclusionOption + " ?")
+		if (response == "yes"):
+			excludeDomain(exclusionOption) # registers a compiled regex for this domain
+		else:
+			continue
+	response = query_yes_no("Do you want to exclude any other domains?")
+	if (response == "yes"):
+		gatherCustomExclusions() # prompts for user-entered domains
+
+def gatherCustomExclusions(): # keep asking for custom domains to exclude until the user answers "no"
+	moreEntries = True
+	while (moreEntries):
+		domainFromUser = raw_input("Enter the domain you want to exclude (e.g. facebook.com): ")
+		if (isValidDomainFormat(domainFromUser)): # invalid entries are reported and skipped
+			excludeDomain(domainFromUser)
+		response = query_yes_no("Do you have more domains you want to enter?")
+		if (response == "no"): # tab-indented so this check runs inside the loop and can end it
+			moreEntries = False
+
+def excludeDomain(domain): # register a compiled regex matching `domain` and any of its subdomains
+	# escape the user text and anchor both ends so hulu.com cannot match nothulu.com or hulu.com.evil.net
+	exclusionRegexs.append(re.compile(r'(?:^|\.)' + EXCLUSION_PATTERN + re.escape(domain) + '$'))
+
+def matchesExclusions(strippedRule): # True when the rule's host matches a registered exclusion regex
+	ruleFields = strippedRule.split() # the second whitespace-separated field is taken as the host
+	strippedDomain = ruleFields[1] if len(ruleFields) > 1 else '' # missing host: use '' instead of raising IndexError
+	if any(exclusionRegex.search(strippedDomain) for exclusionRegex in exclusionRegexs):
+		print 'Domain ' + strippedDomain + ' matched exclusions'
+		return True
+	return False
+# End Exclusion Logic
+
+# Update Logic
 def updateAllSources():
-	for source in sources:
+	for source in SOURCES:
 		updateURL = getUpdateURLFromFile(source)
 		if (updateURL == None):
 			continue;
@@ -59,10 +117,12 @@ def getUpdateURLFromFile(source):
 		print 'Warning: Can\'t find the update file for source ' + source
 		print 'Make sure that there\'s a file at ' + pathToUpdateFile
 	return retURL
+# End Update Logic
 
+# File Logic
 def createInitialFile():
 	mergeFile = tempfile.NamedTemporaryFile()
-	for source in sources:
+	for source in SOURCES:
 		curFile = open(DATA_PATH + '/' + source +'/' + DATA_FILENAMES, 'r')
 		mergeFile.write('\n# Begin ' + source + '\n')
 		mergeFile.write(curFile.read())
@@ -77,11 +137,13 @@ def removeDups(mergeFile):
 	rules_seen = set()
 	for line in mergeFile.readlines():
 		if line[0].startswith("#") or line[0] == '\n':
-			finalFile.write(line)
-			continue;
-		strippedRule = stripRule(line)
-		if strippedRule not in rules_seen:
 			finalFile.write(line) #maintain the comments for readability
+			continue
+		strippedRule = stripRule(line) #strip comments
+		if matchesExclusions(strippedRule):
+			continue
+		if strippedRule not in rules_seen:
+			finalFile.write(line)
 			rules_seen.add(strippedRule)
 		else:
 			duplicatesRemoved += 1
@@ -114,16 +176,16 @@ def writeOpeningHeader(finalFile):
 	finalFile.write('# with a dash of crowd sourcing via Github\n#\n')
 	finalFile.write('# Project home page: https://github.com/StevenBlack/hosts\n#\n')
 	finalFile.write('# Current sources:\n')
-	for source in sources:
+	for source in SOURCES:
 		finalFile.write('#    ' + source + '\n')
 	finalFile.write('#\n')
 	finalFile.write('# Take Note:\n')
 	finalFile.write('# Merging these sources produced ' + str(duplicatesRemoved) + ' duplicates\n')
 	finalFile.write('# ===============================================================\n')
 	finalFile.write(fileContents)
+# End File Logic
 
-
-# HELPER FUNCTIONS
+# Helper Functions
 ## {{{ http://code.activestate.com/recipes/577058/ (r2)
 def query_yes_no(question, default="yes"):
     """Ask a yes/no question via raw_input() and return their answer.
@@ -158,6 +220,14 @@ def query_yes_no(question, default="yes"):
                              "(or 'y' or 'n').\n")
 ## end of http://code.activestate.com/recipes/577058/ }}}
 
+def isValidDomainFormat(domain): # False (after printing a hint) for empty input, www-prefixed names or URLs
+	domainRegex = re.compile("www\d{0,3}[.]|https?://") # require '://' so names like httpbin.org stay valid
+	if (not domain.strip() or domainRegex.match(domain)): # empty input would otherwise exclude every rule
+		print "The domain " + domain + " is not valid. Do not include www.domain.com or http(s)://domain.com. Try again."
+		return False
+	else:
+		return True
+# End Helper Functions
 
 if __name__ == "__main__":
 	main()
\ No newline at end of file