import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as pt
def _read_text(path):
    """Return the entire contents of the text file at *path*."""
    with open(path) as textf:
        return textf.read()


# Load the three source documents as plain strings.
nixon = _read_text("nixon.txt")
clinton = _read_text("clinton.txt")
rail = _read_text("rail-transport.txt")

# Show the first 500 characters as a quick look at the raw text.
print(nixon[:500])
How would you turn this text into a vector?
nixon_vec = [ord(c) for c in nixon]
print nixon_vec[:300]
Let's try another way.
def split_into_words(s):
    """Split the string *s* into a list of words.

    A word is a maximal run of characters other than period, comma, space,
    newline and tab. Separators are discarded.

    Empty strings are never returned: text that begins or ends with a
    separator (for example a sentence ending in ".") does not produce a
    spurious "" word, and an empty input yields an empty list.
    """
    # Kept as a function-local import so the function is self-contained.
    import re

    # Matching the complement of the separator class returns exactly the
    # non-empty pieces that splitting on the separators would produce.
    return re.findall(r"[^., \n\t]+", s)
# Preview the first 300 words. Call-form print with a single argument
# behaves the same on Python 2 and Python 3.
print(split_into_words(nixon)[:300])
# Global vocabulary: maps each known word to its integer index.
word_numbers = {}


def learn(s):
    """Add every word of the string *s* to the global vocabulary.

    Words are obtained with split_into_words(). A word that is not yet in
    word_numbers is assigned the next free index (the current size of the
    dictionary); words that are already known keep their index.
    """
    for token in split_into_words(s):
        # The default argument is evaluated before any insertion, so a new
        # word receives the vocabulary size from before it was added.
        word_numbers.setdefault(token, len(word_numbers))
def string_to_vec(s):
    """Return the word-count vector of the string *s*.

    The result is a NumPy array with one entry per word in word_numbers;
    entry i holds how many times the word with index i occurs in *s*.

    Raises KeyError if *s* contains a word that is not in word_numbers.
    """
    # Resolve every word to its vocabulary index first.
    positions = [word_numbers[token] for token in split_into_words(s)]

    counts = np.zeros(len(word_numbers))
    # np.add.at accumulates repeated indices, so each occurrence of a word
    # adds 1 to its slot (a plain fancy-index += would count it only once).
    np.add.at(counts, np.asarray(positions, dtype=int), 1)
    return counts
# Build the vocabulary from all three texts before vectorizing any of them:
# the length of each vector is the vocabulary size at the time it is built,
# so learning everything first gives the three vectors the same length.
for _text in (nixon, clinton, rail):
    learn(_text)

nixon_v, clinton_v, rail_v = (
    string_to_vec(_text) for _text in (nixon, clinton, rail)
)
Plotting could give some insight into this data.
# Plot each word-count vector; the horizontal position of a value is the
# word's index in the vocabulary.
for _counts in (nixon_v, clinton_v, rail_v):
    pt.plot(_counts)
Now we need a measure of similarity.
def angle_cos(x, y):
    """Return the cosine of the angle between the vectors *x* and *y*.

    Computed as the dot product of the two vectors divided by the product
    of their norms. If either vector has norm zero, the denominator is zero.
    """
    inner = np.dot(x, y)
    scale = la.norm(x) * la.norm(y)
    return inner / scale
# Pairwise cosine similarity of the three word-count vectors. Call-form
# print with a single argument behaves the same on Python 2 and Python 3.
print(angle_cos(nixon_v, clinton_v))
print(angle_cos(clinton_v, rail_v))
print(angle_cos(nixon_v, rail_v))