#!/usr/bin/env python
#coding=utf-8
#
# Takes text from input files, counts the frequency of words and 2-word sequences, sorts them and creates 'tag cloud' style output. 
# 2008 by Sven-S Porst (ssp-web@earthlingsoft.net)
# Use at your own risk and sneer discreetly.
#
#from AppKit import *
# pb = NSPasteboard.generalPasteboard()
#input = pb.stringForType_(NSStringPboardType)
# Upper limit on the number of entries htmloutput() puts into the tag cloud.
maxitems = 100
# Presumably the intended number of font-size steps.
# NOTE(review): htmloutput() does not read this variable; it works with its
# own value of 6, so changing the number here has no effect on the output.
steps = 6

from math import *

# Words that workon() leaves out of the statistics.
# Set this to an empty tuple, i.e. omitwords = (), to count every word.
omitwords = (
	'a', 'an', 'and', 'be', 'by', 'as', 'at', 'can', 'for',
	'had', 'has', 'he', 'her', 'him', 'his',
	'i', 'in', 'in the', 'is', 'isn', 'it',
	'mr', 'not', 'of', 'of the', 'on',
	's', 'she', 'so', 't', 'that', 'the', 'this', 'to',
	'was', 'with', 'you',
)

# Substrings that clean() replaces by a single space before the text is split
# into words: punctuation, digits and a few special cases.
removestrings = (
	',', '.', '"', "'", '!', '?', ':', ';', '--',
	'(', ')', '[', ']', '–',
	'1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
	'’', 'ssp', '%',
)

# lower-cased word or word pair -> [total count, {spelling: count}]
worddict = {}
# (count, most common spelling) tuples, filled in once all input has been read
wordarray = []

def upCountForWordInDict(word, dictionary):
	"""Register one more occurrence of `word` in `dictionary`.

	Entries are keyed by the lower-cased word. Each value is a two-item list:
	the total number of occurrences, followed by a dict that maps every
	spelling seen so far to the number of times that spelling occurred.
	The dictionary is modified in place; nothing is returned.
	"""
	key = word.lower()
	if key not in dictionary:
		# First sighting: total of one, and a single known spelling.
		dictionary[key] = [1, {word: 1}]
		return
	entry = dictionary[key]
	entry[0] += 1
	spellings = entry[1]
	spellings[word] = spellings.get(word, 0) + 1
			
def clean(string):
	"""Return `string` with every entry of `removestrings` replaced by a space.

	The entries are applied one after the other, in the order in which they
	are listed in `removestrings`.
	"""
	for unwanted in removestrings:
		string = string.replace(unwanted, ' ')
	return string

def workon(theInput):
	"""Count the words and two-word sequences found in `theInput`.

	The text is passed through clean() and split on whitespace. Every word
	that is not listed in `omitwords` is counted in `worddict`, and so is the
	pair formed by it and the previously counted word of the same call.

	Omitted words are skipped entirely, so a pair is made of two consecutive
	counted words even when omitted words stood between them. Pairs never
	span two calls because the previous word is forgotten on every call.
	"""
	previousword = ''
	for word in clean(theInput).split():
		# The omit list is written in lower case (it even lists 'i' and 'mr'),
		# so compare the lower-cased word. Comparing the word as typed let
		# 'The', 'I' or 'Mr' slip through although they are meant to be omitted.
		if word.lower() in omitwords:
			continue
		upCountForWordInDict(word, worddict)
		if previousword != '':
			upCountForWordInDict(previousword + ' ' + word, worddict)
		previousword = word

def countstring(number):
	"""Return a short English phrase that says how often something occurred.

	One and two get a wording of their own; every other number is rendered
	as '<number> occurrences'.
	"""
	for special, phrase in ((1, 'appears once'), (2, 'appears twice')):
		if number == special:
			return phrase
	return str(number) + ' occurrences'

def htmloutput(array, steps=6):
	"""Print an HTML page that shows the most frequent entries as a tag cloud.

	array -- list of (count, text) tuples with the highest count first. The
	count of the first entry is used as the maximum when sizes are assigned.
	At most `maxitems` entries from the front of the list are shown, ordered
	alphabetically without regard to case.
	steps -- number of font-size classes (size0 up to size<steps-1>) that are
	generated and assigned; should be at least 1.

	An empty array gives a page with an empty list.
	"""
	# Local import so the block carries its own dependency. clean() does not
	# strip '&', '<' or '>', and writing them out raw would break the markup.
	from xml.sax.saxutils import escape

	# With nothing to show there is no first entry to read the maximum from;
	# any non-zero placeholder works because the loop below does not run.
	maxnumber = array[0][0] if array else 1

	alphabetisedarray = [(a[1], a[0]) for a in array[:maxitems]]
	alphabetisedarray.sort(key=lambda entry: entry[0].lower())

	# Single-argument print(...) behaves the same as a statement (Python 2)
	# and as a function call (Python 3).
	print('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">')
	print('<html><head><title>Tag Map</title>')
	print('<style type="text/css">')
	print('.ssptagcloud {line-height: 200%; text-align:center; padding:0px; }')
	print('.ssptagcloud li { display:inline; padding: 0em 0.2em; }')
	for i in range(0, steps):
		# Font sizes start at 100% and grow slightly faster than linearly.
		print('.ssptagcloud .size' + str(i) + ' { font-size: ' + str(100 + (i**1.1) * 200/steps) + '%;}')
	print('</style></head><body>')
	print('<ul class="ssptagcloud">')
	for text, count in alphabetisedarray:
		# Scale the count to a class index between 0 and steps-1. Floor
		# division gives the same result on Python 2 and Python 3.
		sizeclass = int((steps - 1) * count // maxnumber)
		print('<li class="size' + str(sizeclass) + '" title="' + countstring(count) + '">' + escape(text) + '</li>')
	print('</ul></body></html>')



import fileinput
# Feed every line delivered by fileinput through the counter.
for line in fileinput.input():
	workon(line)

# Boil each record down to (total count, most frequently used spelling).
# On equal counts the spelling met first while iterating wins.
for total, spellings in worddict.values():
	favourite = max(spellings.items(), key=lambda item: item[1])[0]
	wordarray.append((total, favourite))

# Highest counts first; equal counts are ordered by spelling, descending.
wordarray.sort(reverse=True)

htmloutput(wordarray)