In [9]:
import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as pt
In [10]:
with open("nixon.txt") as textf:
    nixon = textf.read()
with open("clinton.txt") as textf:
    clinton = textf.read()
with open("rail-transport.txt") as textf:
    rail = textf.read()
In [11]:
print(nixon[:500])

Richard Nixon
From Wikipedia, the free encyclopedia
Richard Nixon
Richard M. Nixon, ca. 1935 - 1982 - NARA - 530679.jpg
37th President of the United States
In office
January 20, 1969 – August 9, 1974
Vice President  

    Spiro Agnew (1969–1973)
    None (Oct–Dec 1973)
    Gerald Ford (1973–1974)

Preceded by     Lyndon Johnson
Succeeded by    Gerald Ford
36th Vice President of the United States
In office
January 20, 1953 – January 20, 1961
President       Dwight Eisenhower
Preceded b

How would you turn this into a vector?

In [12]:
nixon_vec = [ord(c) for c in nixon]
In [13]:
print nixon_vec[:300]
[10, 82, 105, 99, 104, 97, 114, 100, 32, 78, 105, 120, 111, 110, 10, 70, 114, 111, 109, 32, 87, 105, 107, 105, 112, 101, 100, 105, 97, 44, 32, 116, 104, 101, 32, 102, 114, 101, 101, 32, 101, 110, 99, 121, 99, 108, 111, 112, 101, 100, 105, 97, 10, 82, 105, 99, 104, 97, 114, 100, 32, 78, 105, 120, 111, 110, 10, 82, 105, 99, 104, 97, 114, 100, 32, 77, 46, 32, 78, 105, 120, 111, 110, 44, 32, 99, 97, 46, 32, 49, 57, 51, 53, 32, 45, 32, 49, 57, 56, 50, 32, 45, 32, 78, 65, 82, 65, 32, 45, 32, 53, 51, 48, 54, 55, 57, 46, 106, 112, 103, 10, 51, 55, 116, 104, 32, 80, 114, 101, 115, 105, 100, 101, 110, 116, 32, 111, 102, 32, 116, 104, 101, 32, 85, 110, 105, 116, 101, 100, 32, 83, 116, 97, 116, 101, 115, 10, 73, 110, 32, 111, 102, 102, 105, 99, 101, 10, 74, 97, 110, 117, 97, 114, 121, 32, 50, 48, 44, 32, 49, 57, 54, 57, 32, 226, 128, 147, 32, 65, 117, 103, 117, 115, 116, 32, 57, 44, 32, 49, 57, 55, 52, 10, 86, 105, 99, 101, 32, 80, 114, 101, 115, 105, 100, 101, 110, 116, 32, 32, 10, 10, 32, 32, 32, 32, 83, 112, 105, 114, 111, 32, 65, 103, 110, 101, 119, 32, 40, 49, 57, 54, 57, 226, 128, 147, 49, 57, 55, 51, 41, 10, 32, 32, 32, 32, 78, 111, 110, 101, 32, 40, 79, 99, 116, 226, 128, 147, 68, 101, 99, 32, 49, 57, 55, 51, 41, 10, 32, 32, 32, 32, 71, 101, 114, 97, 108, 100, 32, 70, 111, 114, 100, 32, 40, 49, 57, 55, 51, 226, 128]

Let's try another way.

In [14]:
def split_into_words(s):
    import re
    return re.split("[., \n\t]+", s)
In [17]:
print split_into_words(nixon)[:300]
['', 'Richard', 'Nixon', 'From', 'Wikipedia', 'the', 'free', 'encyclopedia', 'Richard', 'Nixon', 'Richard', 'M', 'Nixon', 'ca', '1935', '-', '1982', '-', 'NARA', '-', '530679', 'jpg', '37th', 'President', 'of', 'the', 'United', 'States', 'In', 'office', 'January', '20', '1969', '\xe2\x80\x93', 'August', '9', '1974', 'Vice', 'President', 'Spiro', 'Agnew', '(1969\xe2\x80\x931973)', 'None', '(Oct\xe2\x80\x93Dec', '1973)', 'Gerald', 'Ford', '(1973\xe2\x80\x931974)', 'Preceded', 'by', 'Lyndon', 'Johnson', 'Succeeded', 'by', 'Gerald', 'Ford', '36th', 'Vice', 'President', 'of', 'the', 'United', 'States', 'In', 'office', 'January', '20', '1953', '\xe2\x80\x93', 'January', '20', '1961', 'President', 'Dwight', 'Eisenhower', 'Preceded', 'by', 'Alben', 'Barkley', 'Succeeded', 'by', 'Lyndon', 'Johnson', 'United', 'States', 'Senator', 'from', 'California', 'In', 'office', 'December', '4', '1950', '\xe2\x80\x93', 'January', '1', '1953', 'Preceded', 'by', 'Sheridan', 'Downey', 'Succeeded', 'by', 'Thomas', 'Kuchel', 'Member', 'of', 'the', 'U', 'S', 'House', 'of', 'Representatives', 'from', "California's", '12th', 'district', 'In', 'office', 'January', '3', '1947', '\xe2\x80\x93', 'December', '1', '1950', 'Preceded', 'by', 'Jerry', 'Voorhis', 'Succeeded', 'by', 'Patrick', 'Hillings', 'Personal', 'details', 'Born', 'Richard', 'Milhous', 'Nixon', 'January', '9', '1913', 'Yorba', 'Linda', 'California', 'U', 'S', 'Died', 'April', '22', '1994', '(aged', '81)', 'New', 'York', 'City', 'New', 'York', 'U', 'S', 'Resting', 'place', 'Richard', 'Nixon', 'Presidential', 'Library', 'and', 'Museum', 'Yorba', 'Linda', 'California', 'U', 'S', 'Political', 'party', 'Republican', 'Spouse(s)', 'Pat', 'Ryan', '(1940\xe2\x80\x931993;', 'her', 'death)', 'Children', 'Tricia', 'Nixon', 'Julie', 'Nixon', 'Alma', 'mater', 'Whittier', 'College', 'Duke', 'University', 'School', 'of', 'Law', 'Profession', 'Lawyer', 'Religion', 'Quaker', 'Nickname', 'Dick', 'Signature', 'Cursive', 'signature', 'in', 'ink', 'Military', 'service', 'Allegiance', 'United', 'States', 'of', 'America', 'Service/branch', 'United', 'States', 'Navy', 'Years', 'of', 'service', '1942\xe2\x80\x931946', 'Rank', 'US-O4', 'insignia', 'svg', 'Lieutenant', 'commander', 'Battles/wars', 'World', 'War', 'II', '\xe2\x80\xa2', 'Pacific', 'War', 'Awards', 'American', 'Campaign', 'Medal', 'Asiatic-Pacific', 'Campaign', 'Medal', '(2', 'service', 'stars)', 'World', 'War', 'II', 'Victory', 'Medal', 'Richard', 'Milhous', 'Nixon', '(January', '9', '1913', '\xe2\x80\x93', 'April', '22', '1994)', 'was', 'the', '37th', 'President', 'of', 'the', 'United', 'States', 'serving', 'from', '1969', 'to', '1974', 'when', 'he', 'became', 'the', 'only', 'president', 'to', 'resign', 'the', 'office', 'Nixon', 'had', 'previously', 'served', 'as', 'a', 'Republican', 'U', 'S', 'Representative', 'and', 'Senator', 'from', 'California', 'and', 'as']

In [18]:
word_numbers = {}

def learn(s):
    for word in split_into_words(s):
        if word not in word_numbers:
            word_numbers[word] = len(word_numbers)
            
def string_to_vec(s):
    result = np.zeros(len(word_numbers))
    for word in split_into_words(s):
        result[word_numbers[word]] += 1
    return result
In [19]:
learn(nixon)
learn(clinton)
learn(rail)

nixon_v = string_to_vec(nixon)
clinton_v = string_to_vec(clinton)
rail_v = string_to_vec(rail)

Plotting could give some insight on this data.

In [20]:
pt.plot(nixon_v)
Out[20]:
[<matplotlib.lines.Line2D at 0x7f2970fb1510>]
In [21]:
pt.plot(clinton_v)
Out[21]:
[<matplotlib.lines.Line2D at 0x7f2970deaf90>]
In [22]:
pt.plot(rail_v)
Out[22]:
[<matplotlib.lines.Line2D at 0x7f2970d2bbd0>]

Now we need a measure of similarity.

In [23]:
def angle_cos(x, y):
    return np.dot(x, y)/(la.norm(x)*la.norm(y))
In [24]:
print angle_cos(nixon_v, clinton_v)
0.858089212872

In [27]:
print angle_cos(clinton_v, rail_v)
0.835685969275

In [28]:
print angle_cos(nixon_v, rail_v)
0.847799936746