import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as pt
# Read the complete contents of each of the three text files into a string.
with open("nixon.txt") as textf:
    nixon = textf.read()
with open("clinton.txt") as textf:
    clinton = textf.read()
with open("rail-transport.txt") as textf:
    rail = textf.read()
# Preview the first 500 characters of the text read from nixon.txt.
print(nixon[:500])
How would you turn this into a vector?
# First attempt: one vector entry per character, holding that character's
# ord() value.
nixon_vec = [ord(c) for c in nixon]
# print(...) with a single argument behaves the same on Python 2 and 3; the
# bare print statement is a syntax error on Python 3.
print(nixon_vec[:300])
Let's try another way.
def split_into_words(s):
    """Split the string s into a list of words.

    Words are separated by runs of one or more periods, commas, spaces,
    newlines or tabs.  Empty strings, which re.split produces when s starts
    or ends with a separator (or is empty), are dropped so that "" is never
    treated as a word.
    """
    # Local import: `re` is not among this file's top-level imports.
    import re
    return [word for word in re.split(r"[., \n\t]+", s) if word]
# Preview the first 300 words of the text.  print(...) with a single argument
# behaves the same on Python 2 and 3; the bare print statement is a syntax
# error on Python 3.
print(split_into_words(nixon)[:300])
# Maps each distinct word seen so far to its integer index.
word_numbers = {}


def learn(s):
    """Register every word of s that is not yet in word_numbers.

    A new word receives the next free index, i.e. the number of words that
    were known at the moment it was first encountered.  Words that are
    already known keep their index.
    """
    for token in split_into_words(s):
        # len(word_numbers) is evaluated before the insertion, and setdefault
        # leaves an existing entry untouched.
        word_numbers.setdefault(token, len(word_numbers))
            
def string_to_vec(s):
    """Return the word-count vector of the string s.

    Entry i of the result is the number of times the word whose index in
    word_numbers is i occurs in s.  The length of the result is the size of
    word_numbers at the time of the call.

    Raises KeyError if s contains a word that is not in word_numbers.
    """
    counts = np.zeros(len(word_numbers))
    for token in split_into_words(s):
        slot = word_numbers[token]
        counts[slot] += 1
    return counts
# Learn the words of all three texts before building any vector:
# string_to_vec sizes its result from the current size of word_numbers, so
# doing this first gives all three vectors the same length.
learn(nixon)
learn(clinton)
learn(rail)
# Word-count vector of each text over the shared word_numbers mapping.
nixon_v = string_to_vec(nixon)
clinton_v = string_to_vec(clinton)
rail_v = string_to_vec(rail)
Plotting could give some insight into this data.
# Overlay the three word-count vectors in one figure.  The labels and the
# legend make it possible to tell which curve belongs to which text.
pt.plot(nixon_v, label="nixon")
pt.plot(clinton_v, label="clinton")
pt.plot(rail_v, label="rail")
pt.legend()
Now we need a measure of similarity.
def angle_cos(x, y):
    """Return the cosine of the angle between the vectors x and y.

    Computed as the dot product of x and y divided by the product of their
    norms (numpy.linalg.norm with its default arguments).
    """
    inner = np.dot(x, y)
    scale = la.norm(x) * la.norm(y)
    return inner / scale
# Pairwise angle cosines of the three word-count vectors.  print(...) with a
# single argument behaves the same on Python 2 and 3; the bare print
# statement is a syntax error on Python 3.
print(angle_cos(nixon_v, clinton_v))
print(angle_cos(clinton_v, rail_v))
print(angle_cos(nixon_v, rail_v))