# Using TF-IDF and cosine similarity to build a Christmas carol search engine

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

## Load the carols

In [2]:
# Load the carol titles and lyrics from the CSV file into a pandas DataFrame
file_name = "carols.csv"
df = pd.read_csv("./" + file_name)

## Inspect the data to check we have carols

In [3]:
# Preview the first ten rows
df.iloc[:10]

Unnamed: 0,Carol,Lyrics
0,Have Yourself A Merry Little Christmas,"Have yourself a merry little Christmas,\nLet y..."
1,I'll Be Home For Christmas,I'll be home for Christmas\nYou can count on m...
2,It's the most wonderful time of the year,It's the most wonderful time of the year.\nWit...
3,Jingle Bell Rock,"Jingle bell, jingle bell, jingle bell rock\nJi..."
4,Jingle Bells,Dashing through the snow\nOn a one-horse open ...
5,Let It Snow!,"Oh, the weather outside is frightful,\nBut the..."
6,O Christmas Tree,"O Christmas Tree,\nO Christmas Tree,\nHow stea..."
7,Rocking around the Christmas Tree,Rocking around the Christmas tree\nat the Chri...
8,Rudolph The Red-Nosed Reindeer,You know Dasher and Dancer\nAnd Prancer and Vi...
9,Santa Claus Is Coming To Town,You better watch out\nYou better not cry\nBett...


## Check for and remove carols with missing values

In [4]:
# Flag every row that has at least one missing value and show those rows, if any
rows_missing_mask = df.isnull().any(axis=1)
missing_values = rows_missing_mask.any()
if missing_values:
    display(df[rows_missing_mask])

In [5]:
# Keep only the rows in which every column has a value
df = df.dropna()

## Determine the term frequencies (TFs)

In [6]:
# Fit a CountVectorizer on the lyrics (one document per carol), dropping English stop
# words, and build the document-term matrix of raw term counts
cv = CountVectorizer(stop_words="english")
doc_term_matrix = cv.fit_transform(df["Lyrics"])

## Perform some simple analysis

In [7]:
# Shape of the matrix: (number of documents, number of terms)
n_docs, n_terms = doc_term_matrix.shape
(n_docs, n_terms)

(16, 550)

In [8]:
# List the learned vocabulary. The vectorizer's default token pattern only keeps tokens
# of two or more characters, so single-character words like "a" never appear.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces
# it and returns an array, so convert it to a plain list for display.
cv.get_feature_names_out().tolist()

['aglow',
 'ago',
 'ain',
 'air',
 'allow',
 'appear',
 'arose',
 'ashes',
 'aside',
 'awake',
 'away',
 'bad',
 'bank',
 'bay',
 'beard',
 'beautiful',
 'bed',
 'beds',
 'befalls',
 'begun',
 'bell',
 'belling',
 'bells',
 'belly',
 'better',
 'bird',
 'birds',
 'blitzen',
 'blixem',
 'blowing',
 'bluebird',
 'bob',
 'bough',
 'boughs',
 'bound',
 'bow',
 'bowlful',
 'branches',
 'breast',
 'bright',
 'bring',
 'broad',
 'brought',
 'brown',
 'build',
 'bulb',
 'bundle',
 'bushels',
 'bye',
 'called',
 'calling',
 'came',
 'cap',
 'card',
 'care',
 'caroling',
 'carols',
 'checking',
 'cheeks',
 'cheer',
 'cherry',
 'chestnuts',
 'child',
 'children',
 'chilling',
 'chime',
 'chimney',
 'chin',
 'choir',
 'christmas',
 'christmases',
 'chubby',
 'circus',
 'clatter',
 'claus',
 'clime',
 'clock',
 'clothes',
 'clown',
 'columbus',
 'comet',
 'comfort',
 'coming',
 'conspire',
 'constant',
 'corn',
 'count',
 'couple',
 'coursers',
 'crack',
 'creature',
 'cupid',
 'danced',
 'dancer',

In [9]:
# Count the terms in the vocabulary
# (get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out())
len(cv.get_feature_names_out())

550

In [10]:
# View the word counts: one row per carol, one column per term
# (get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out())
word_counts = pd.DataFrame(
    doc_term_matrix.toarray(),
    index=df["Carol"],
    columns=cv.get_feature_names_out(),
)
word_counts

Unnamed: 0_level_0,aglow,ago,ain,air,allow,appear,arose,ashes,aside,awake,...,word,work,wreath,write,year,years,yippee,yore,young,yule
Carol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Have Yourself A Merry Little Christmas,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
I'll Be Home For Christmas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
It's the most wonderful time of the year,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,4,0,0,0,0,0
Jingle Bell Rock,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Jingle Bells,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Let It Snow!,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O Christmas Tree,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Rocking around the Christmas Tree,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Rudolph The Red-Nosed Reindeer,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Santa Claus Is Coming To Town,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Total count of each term over all carols, most frequent first
term_totals = word_counts.sum()
term_totals.sort_values(ascending=False)

In [None]:
# Compare how often a couple of chosen words occur in each carol
word_counts.loc[:, ["christmas", "jingle"]]

## Determine the inverse document frequencies (IDFs)

In [None]:
# Learn the inverse document frequency (IDF) of every term from the term counts
idfs = TfidfTransformer().fit(doc_term_matrix)
idfs

In [None]:
# Create a data frame with the IDF value of every term
# (get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out())
idfs_df = pd.DataFrame(idfs.idf_, index=cv.get_feature_names_out(), columns=["idfs"])

# Sort descending and display, so the highest-IDF terms come first
# High-IDF terms appear in few documents; low-IDF terms appear in many
idfs_df.sort_values(by=["idfs"], ascending=False)

## Put it all together to calculate the TF-IDFs

In [None]:
# Apply the fitted transformer to the raw term counts to obtain the TF-IDF score of
# every term in every document
tf_idfs = idfs.transform(doc_term_matrix)

## Do some more analysis

In [None]:
# View the TF-IDF scores of the first document, tf_idfs[0], highest score first
# (get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out())
# toarray() yields a plain ndarray instead of the np.matrix returned by todense()
tf_idf_doc0 = pd.DataFrame(
    tf_idfs[0].T.toarray(),
    index=cv.get_feature_names_out(),
    columns=["tf-idf"],
)
tf_idf_doc0.sort_values(by=["tf-idf"], ascending=False)

In [None]:
# View all of the TF-IDF scores: one row per term, one column per document
# (get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out())
# toarray() yields a plain ndarray instead of the np.matrix returned by todense()
tf_idf_all_docs = pd.DataFrame(tf_idfs.T.toarray(), index=cv.get_feature_names_out())
tf_idf_all_docs

In [None]:
# Nicer if we orient the scores the same way as the term frequencies at the top:
# one row per carol, one column per term.
# The original transposed twice (.T, then np.transpose), which cancels out, so the
# dense array is used directly.
# (get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out())
tf_idf_all_docs_nicer = pd.DataFrame(
    tf_idfs.toarray(),
    index=df["Carol"],
    columns=cv.get_feature_names_out(),
)
tf_idf_all_docs_nicer

In [None]:
# Narrow the view down to the TF-IDF scores of a couple of words of interest
tf_idf_all_docs_nicer.loc[:, ["christmas", "jingle"]]

## Now prepare a search query

In [None]:
# The words to search the carols for, separated by spaces
query = "red rocking reindeer"

# Count the query's terms with the already-fitted vectorizer, so the query vector has
# one column per term learned from the carols (the query is passed as a one-item list)
query_term_matrix = cv.transform([query])

In [None]:
# Across all of the terms, view the word counts for the query
# (get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out())
query_counts = pd.DataFrame(query_term_matrix.toarray(), columns=cv.get_feature_names_out())

# Display `query_counts` on its own to see every vocabulary term; here we show just the
# query terms (shows what we know already of course).
# Query words with no matching column (unseen words, stop words) are skipped instead of
# raising a KeyError; lower-casing matches the vectorizer's default lower-casing, and
# split() without an argument tolerates repeated spaces.
query_terms = [term for term in query.lower().split() if term in query_counts.columns]
query_counts[query_terms]

## Calculate the cosine similarity between the TF-IDFs and the query words 

In [None]:
# Score every carol against the query: cosine similarity of each TF-IDF row with the
# query vector
# NOTE(review): the query vector holds raw term counts rather than TF-IDF weights -
# confirm this is intended
results = cosine_similarity(X=tf_idfs, Y=query_term_matrix)
results

In [None]:
# Flatten the similarity scores into a 1-D array
results = results.ravel()
results

## Show the results

In [None]:
# Print the top search results - voila, hopefully!
# argsort returns the indexes that sort the scores in ascending order, so reverse them
# to rank the best match first and keep at most top_n of them
top_n = 10
print("Search results for: '{}'".format(query))
for i in results.argsort()[::-1][:top_n]:
    # Only report carols with a positive similarity score
    if results[i] > 0:
        # Look the title up by column name rather than by column position, so the
        # output does not depend on the order of the columns in the CSV
        print("Carol {}. {} {}%".format(i, df["Carol"].iloc[i], round(100*results[i])))