# pyceptron.py: an object-oriented implementation of the batch perceptron algorithm. # Designed for binary classifications. Includes training, testing and cross-validation # functions. # # This software is (c) 2005 by Meredith L. Patterson, and is released under the # terms of the GNU General Public License, version 2, or any later version at # your discretion. It is provided free of charge and with NO WARRANTY # WHATSOEVER. If it breaks, congratulations: you get to keep both halves. from string import * import random, copy, math class Pyceptron: def __init__(self, weight=[]): """ Initialize a new, untrained perceptron with an optional starting weight vector. """ self.weight = weight def readlines(self, filename): """ Read data from a file, and generate a dataset for use in training or testing. The file must be of the format x 1:a 2:b 3:c 4:d ... where x is either +1 or -1, and a, b, c, ... are ints or floats. If some values in the list are 0, you can omit them, e.g. -1 1:.03521 2:1 5:-.842 ... and they will be filled in for you. However, the values MUST be in order (ie, you cannot have 1:a 4:d 2:b), and the last value in the first line MUST have the largest index that will be seen in the dataset (ie, if line 0 stops at 13:m, no subsequent line may have a vector element of index greater than 13). A dataset is a list of pairs, where each pair consists of the class into which the element falls and the (non-normalized) vector describing the element. """ infile = file(filename, 'r') set = [] for line in infile: line = split(strip(line)) ret = [] for thing in line[1:]: newthing = split(thing, ':') if (int(newthing[0]) > line.index(thing)): while len(ret) < int(newthing[0])-1: ret.append(0.0) ret.append(float(newthing[1])) foo = [int(line[0]), ret] set.append(foo) return set def signum(self, x): """ Not quite your standard signum function; returns 1 if x >= 0, -1 otherwise. """ if isinstance(x, (int, float)): if x > 0: return 1 elif x <= 0: return -1 else: print "Error: argument to signum must be int or float" return None def train(self, A, delta, theta, max_iterations): """ Train the perceptron using training data (read in using readlines()), and give the perceptron the resulting weight vector. A is the training set, delta is the learning factor, theta is the threshold (terminate training once the error value is within this tolerance), and max_iterations is the maximum number of times to update the weight vector -- crucial if our data are not linearly separable! """ if (len(A[0]) != 2): print "Error: not a training set. Check your input." return None A1 = copy.deepcopy(A) # so that we don't consume the input, and can reuse A """ First, normalize the vectors by reversing the sign of each vector element for each vector that falls into the -1 class. Then strip out all the class information, because now every vector falls into the +1 class. We need to generate a weight vector that makes every vector in the normalised set fall on the same side of the hyperplane. """ for thing in A1: if (thing[0] == -1): thing[1] = [-x for x in thing[1]] del(thing[0]) A1 = [x for [x] in A1] # Initialize the weight vector to all ones, of the same magnitude as each training vector. self.weight = [1.0 for x in A1[0]] """ The following is somewhat ugly because of Python's lack of a do/while loop. The set is partitioned based on the initial weight vector we established above, then we sum the vectors that have been misclassified. Next, take the norm of the resulting vector. At this point, the while loop starts; it terminates when the value of the norm is within the threshold, theta, or when we've looped as many times as specified in max_iterations. """ results = [self.signum(sum(map((lambda x, y: x*y), self.weight, a))) for a in A1] error = [0 for x in A1[0]] for x in range(len(results)): if (results[x] < 1): for y in range(len(A1[x])): error[y] += delta * A1[x][y] errorsquared = [x**2 for x in error] errval = math.sqrt(sum(errorsquared)) iterations = 0 while (errval > theta and iterations < max_iterations): for x in range(len(error)): self.weight[x] += error[x] error = [0 for x in A1[0]] results = [self.signum(sum(map((lambda x, y: x*y), self.weight, a))) for a in A1] for x in range(len(results)): if (results[x] < 1): for y in range(len(A1[x])): error[y] = delta * A1[x][y] errorsquared = [x**2 for x in error] errval = math.sqrt(sum(errorsquared)) iterations += 1 print "Took",iterations,"iterations" # Just in case you care. If this were a C function, it would return void. def F1(self, A, w = []): """ You can specify a weight vector if you want, but it makes more sense to train and use the perceptron's weight vector. """ if w == []: w = self.weight """ Test the perceptron over a testing set, which must be of the form [[class, [x1, x2, x3...]] [class, [x1, x2, x3...]] [class, [x1, x2, x3...]]] and will throw an error otherwise. From this we calculate the precision and recall, and return the F1 measure. """ if (len(A[0]) != 2): print "Error: not a testing set. Check your input." return None trupos = 0 truneg = 0 fpos = 0 fneg = 0 """ The following line sums (a-transposed)(w) for every vector in the set. Python list comprehensions are *fast*, which is why we're doing it this way. In the end, it's rather more like sum notation than a for loop would be. :) """ results = [self.signum(sum(map((lambda x, y: x*y), a[1], w))) for a in A] """ Total up the number of true and false positives and negatives we've seen, by comparing our results to the actual values. """ for x, y in zip(A, results): if ((x[0] == 1) and (y == 1)): trupos += 1 elif ((x[0] == 1) and (y == -1)): fneg += 1 elif ((x[0] == -1) and (y == 1)): fpos += 1 elif ((x[0] == -1) and (y == -1)): truneg += 1 else: print "Error in F1 - we should never have gotten here." precision = float(trupos) / float(trupos + fpos) recall = float(trupos) / float(trupos + fneg) return (2*precision*recall/(precision + recall)) def Ave_F1(self, A, folds, w = []): if w == []: w = self.weight F1s = [] random.shuffle(A) # Ibuilt-in functions like this. for x in range(folds): teststart = x*(int(math.ceil(float(len(A))/folds))) testend = min( (x+1)*(int(math.ceil(float(len(A))/folds)))-1, len(A)-1) testset = A[teststart:testend] trainingset = copy.deepcopy(A) del(trainingset[teststart:testend]) self.train(trainingset, 1, .1, 3*len(trainingset)) F1s.append(self.F1(testset, w)) return float(sum(F1s))/folds