# pyceptron.py: an object-oriented implementation of the batch perceptron algorithm.
# Designed for binary classifications. Includes training, testing and cross-validation 
# functions.
#
# This software is (c) 2005 by Meredith L. Patterson, and is released under the
# terms of the GNU General Public License, version 2, or any later version at
# your discretion. It is provided free of charge and with NO WARRANTY 
# WHATSOEVER. If it breaks, congratulations: you get to keep both halves.

from string import *
import random, copy, math

class Pyceptron:
	def __init__(self, weight=[]):
		"""
		Initialize a new, untrained perceptron with an optional starting weight vector.
		"""
		self.weight = weight
		
	def readlines(self, filename):
		"""
		Read data from a file, and generate a dataset for use in training or testing.
		
		The file must be of the format
			x 1:a 2:b 3:c 4:d ...
		where x is either +1 or -1, and a, b, c, ... are ints or floats. If some 
		values in the list are 0, you can omit them, e.g.
			-1 1:.03521 2:1 5:-.842 ...
		and they will be filled in for you. However, the values MUST be in order
		(ie, you cannot have 1:a 4:d 2:b), and the last value in the first line
		MUST have the largest index that will be seen in the dataset (ie, if line
		0 stops at 13:m, no subsequent line may have a vector element of index
		greater than 13).

		A dataset is a list of pairs, where each pair consists of the class into which
		the element falls and the (non-normalized) vector describing the element.
		"""
		infile = file(filename, 'r')
		set = []
		for line in infile:
			line = split(strip(line))
			ret = []
			for thing in line[1:]:
				newthing = split(thing, ':')
				if (int(newthing[0]) > line.index(thing)):
					while len(ret) < int(newthing[0])-1:
						ret.append(0.0)
				ret.append(float(newthing[1]))
			foo = [int(line[0]), ret]
			set.append(foo)
		return set

	def signum(self, x):
		"""
		Not quite your standard signum function; returns 1 if x >= 0, -1 otherwise.
		"""
		if isinstance(x, (int, float)):
			if x > 0:
				return 1
			elif x <= 0:
				return -1
		else:
			print "Error: argument to signum must be int or float"
			return None
		
	def train(self, A, delta, theta, max_iterations):
		"""
		Train the perceptron using training data (read in using readlines()),
		and give the perceptron the resulting weight vector.
		
		A is the training set, delta is the learning factor, theta is the 
		threshold (terminate training once the error value is within this
		tolerance), and max_iterations is the maximum number of times to 
		update the weight vector -- crucial if our data are not linearly 
		separable!
		"""
		if (len(A[0]) != 2):
			print "Error: not a training set. Check your input."
			return None

		A1 = copy.deepcopy(A)  # so that we don't consume the input, and can reuse A
	
		"""
		First, normalize the vectors by reversing the sign of each vector
		element for each vector that falls into the -1 class. Then strip out
		all the class information, because now every vector falls into the
		+1 class. We need to generate a weight vector that makes every 
		vector in the normalised set fall on the same side of the hyperplane.
		"""	
		for thing in A1:
			if (thing[0] == -1):
				thing[1] = [-x for x in thing[1]]
			del(thing[0])
		A1 = [x for [x] in A1]
		
		# Initialize the weight vector to all ones, of the same magnitude as each training vector.
		self.weight = [1.0 for x in A1[0]]
		"""
		The following is somewhat ugly because of Python's lack of a do/while loop.
		The set is partitioned based on the initial weight vector we established above,
		then we sum the vectors that have been misclassified. Next, take the norm of the 
		resulting vector. At this point, the while loop starts; it terminates when the 
		value of the norm is within the threshold, theta, or when we've looped as many 
		times as specified in max_iterations.
		"""
		results = [self.signum(sum(map((lambda x, y: x*y), self.weight, a))) for a in A1]
		error = [0 for x in A1[0]]
		for x in range(len(results)):
			if (results[x] < 1):
				for y in range(len(A1[x])):
					error[y] += delta * A1[x][y]
		errorsquared = [x**2 for x in error]
		errval = math.sqrt(sum(errorsquared))
		iterations = 0
		while (errval > theta and iterations < max_iterations):
			for x in range(len(error)):
				self.weight[x] += error[x]
			error = [0 for x in A1[0]]
			results = [self.signum(sum(map((lambda x, y: x*y), self.weight, a))) for a in A1]
			for x in range(len(results)):
				if (results[x] < 1):
					for y in range(len(A1[x])):
						error[y] = delta * A1[x][y]
			errorsquared = [x**2 for x in error]
			errval = math.sqrt(sum(errorsquared))
			iterations += 1
		print "Took",iterations,"iterations" # Just in case you care. If this were a C function, it would return void.

	def F1(self, A, w = []):
		"""
		You can specify a weight vector if you want, but it makes more sense to train
		and use the perceptron's weight vector.
		"""
		if w == []:
			w = self.weight
		"""
		Test the perceptron over a testing set, which must be of the form
		  [[class, [x1, x2, x3...]]
		   [class, [x1, x2, x3...]]
		   [class, [x1, x2, x3...]]]
		and will throw an error otherwise. From this we calculate the precision and 
		recall, and return the F1 measure.
		"""
		if (len(A[0]) != 2):
			print "Error: not a testing set. Check your input."
			return None
		trupos = 0
		truneg = 0
		fpos = 0
		fneg = 0

		"""
		The following line sums (a-transposed)(w) for every vector in the set.
		Python list comprehensions are *fast*, which is why we're doing it this
		way. In the end, it's rather more like sum notation than a for loop 
		would be. :)
		"""
		results = [self.signum(sum(map((lambda x, y: x*y), a[1], w))) for a in A]
		
		"""
		Total up the number of true and false positives and negatives we've seen,
		by comparing our results to the actual values.
		"""
		for x, y in zip(A, results):
			if ((x[0] == 1) and (y == 1)): trupos += 1
			elif ((x[0] == 1) and (y == -1)): fneg += 1
			elif ((x[0] == -1) and (y == 1)): fpos += 1
			elif ((x[0] == -1) and (y == -1)): truneg += 1
			else: print "Error in F1 - we should never have gotten here."

		precision = float(trupos) / float(trupos + fpos)
		recall = float(trupos) / float(trupos + fneg)
		
		return (2*precision*recall/(precision + recall))

	def Ave_F1(self, A, folds, w = []):
		if w == []:
			w = self.weight
		F1s = []
		random.shuffle(A) # I  built-in functions like this.
		for x in range(folds):
			teststart = x*(int(math.ceil(float(len(A))/folds)))
			testend = min( (x+1)*(int(math.ceil(float(len(A))/folds)))-1, len(A)-1)
			testset = A[teststart:testend]
			trainingset = copy.deepcopy(A)
			del(trainingset[teststart:testend])
			self.train(trainingset, 1, .1, 3*len(trainingset))
			F1s.append(self.F1(testset, w))
		return float(sum(F1s))/folds