#NLP
import nltk
nltk.download()
from nltk.tokenize import sent_tokenize, word_tokenize
# Best for European languages
text = "Hey Bob! What's the weather at 8 o'clock"
sent_tokenize(text)
# ['Hey Bob!', "What's the weather at 8 o'clock"]
word_tokenize(sent_tokenize(text)[1])
# ['What', "'s", 'the', 'weather', 'at', '8', "o'clock"]tokens = word_tokenize("I went to Paris to meet Bob")
nltk.pos_tag(tokens)
# [('I', 'PRP'),
# ('went', 'VBD'),
# ('to', 'TO'),
# ('Paris', 'NNP'),
# ('to', 'TO'),
# ('meet', 'VB'),
# ('Bob', 'NNP')]
nltk.ne_chunk(nltk.pos_tag(tokens), binary=True)
# Tree('S', [
# ('I', 'PRP'), ('went', 'VBD'), ('to', 'TO'),
# Tree('NE', [('Paris', 'NNP')]), ('to', 'TO'), ('meet', 'VB'),
# Tree('NE', [('Bob', 'NNP')]),
# ])
The POS tagger in NLTK isn't that great; if you want a better model, take a look at SyntaxNet.
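To pull the entity strings out of that tree, something like this works (a small sketch using nltk's Tree API):
tree = nltk.ne_chunk(nltk.pos_tag(tokens), binary=True)
[" ".join(word for word, tag in subtree.leaves()) for subtree in tree.subtrees(lambda t: t.label() == 'NE')]
# ['Paris', 'Bob']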
Word -> Stem (non-changing portion)
# The two most used stemmers
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer
snow = SnowballStemmer('english')
snow.stem("own") == snow.stem("owning") == snow.stem("owned")
# True
snow.stem("entities") == snow.stem("entity")
# True
Speed: Snowball > Porter
Performance: Porter > Snowball
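A quick example where the two disagree (exact outputs may vary slightly between NLTK versions):
porter = PorterStemmer()
porter.stem("fairly"), snow.stem("fairly")
# typically ('fairli', 'fair')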
Word -> Lemma (dictionary form)
from nltk.stem import WordNetLemmatizer
wordnet = WordNetLemmatizer()
wordnet.lemmatize("women")
# u'woman'
wordnet.lemmatize("marketing")
# 'marketing'
wordnet.lemmatize("markets")
# u'market'
snow.stem("marketing")
# u'market'
snow.stem("markets")
# u'market'
/!\ Really slow /!\
from nltk.corpus import stopwords
len(stopwords.words('english'))
# 153
stopwords.words('english')[:20]
# [u'i',
# u'me',
# u'my',
# u'myself',
# u'we',
# u'our',
# u'ours',
# u'ourselves',
# u'you',
# u'your',
# u'yours',
# u'yourself',
# u'yourselves',
# u'he',
# u'him',
# u'his',
# u'himself',
# u'she',
# u'her',
# u'hers']
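Typical usage is just filtering them out of a token list (a minimal sketch):
stops = set(stopwords.words('english'))
words = word_tokenize("This is an example showing off stop word filtration")
[w for w in words if w.lower() not in stops]
# e.g. ['example', 'showing', 'stop', 'word', 'filtration']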
Most widely used representation: bag of words (occurrence counts)
Input: Corpus of text documents
Output: Matrix NxM with N = # of documents, M = # of unique words
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
'This is the first document.',
'This is the second second document.',
'And the third one.',
'Is this the first document?',
]
vectorizer = CountVectorizer()
vectorizer.fit_transform(corpus).toarray()
# array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
# [0, 1, 0, 1, 0, 2, 1, 0, 1],
# [1, 0, 0, 0, 1, 0, 1, 1, 0],
# [0, 1, 1, 1, 0, 0, 1, 0, 1]])
vectorizer.get_feature_names()
# [u'and', u'document', u'first', u'is', u'one', u'second',
# u'the', u'third', u'this']Normalization of occurrence matrix:
Frequency of a word in a document, weighted by its rarity in the corpus:
tf: reward for a high number of occurrences in a document
idf: penalty for words that appear in too many documents of the corpus
(the log term is there because, most of the time, word frequencies across a corpus follow a power law)
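A minimal sketch of what TfidfVectorizer (used below) computes with its default settings, assuming scikit-learn's defaults of smoothed idf and L2 row normalization, and a dense counts matrix X of shape n_documents x n_words:
import numpy as np
def tfidf(X):
    n_docs = X.shape[0]
    df = (X > 0).sum(axis=0)                   # number of documents containing each word
    idf = np.log((1 + n_docs) / (1 + df)) + 1  # smoothed idf: rarer words get a bigger weight
    weighted = X * idf                         # tf * idf
    return weighted / np.linalg.norm(weighted, axis=1, keepdims=True)  # L2-normalize each document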
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
'This is the first document.',
'This is the second second document.',
'And the third one.',
'Is this the first document?',
]
vectorizer = TfidfVectorizer()
np.around(vectorizer.fit_transform(corpus).toarray(), decimals=2)
# array([[ 0. , 0.44, 0.54, 0.44, 0. , 0. , 0.36, 0. , 0.44],
# [ 0. , 0.27, 0. , 0.27, 0. , 0.85, 0.22, 0. , 0.27],
# [ 0.55, 0. , 0. , 0. , 0.55, 0. , 0.29, 0.55, 0. ],
# [ 0. , 0.44, 0.54, 0.44, 0. , 0. , 0.36, 0. , 0.44]])
vectorizer.get_feature_names()
# [u'and', u'document', u'first', u'is', u'one', u'second',
# u'the', u'third', u'this']
N-grams:
from sklearn.feature_extraction.text import CountVectorizer
text = "word1 word2 word3 word4 word5"
CountVectorizer(ngram_range=(1,4)).build_analyzer()(text)
# [u'word1',
# u'word2',
# u'word3',
# u'word4',
# u'word5',
# u'word1 word2',
# u'word2 word3',
# u'word3 word4',
# u'word4 word5',
# u'word1 word2 word3',
# u'word2 word3 word4',
# u'word3 word4 word5',
# u'word1 word2 word3 word4',
# u'word2 word3 word4 word5']
# Do the same in plain Python
def find_ngrams(input_list, n):
    return list(zip(*[input_list[i:] for i in range(n)]))
# find_ngrams("word1 word2 word3".split(), 2)
# [('word1', 'word2'), ('word2', 'word3')]
An embedding model learns to map each discrete word to a low-dimensional continuous vector space, based on the distributional properties observed in a raw text corpus.
Premise: words that appear next to each other are related
Two distinct models: CBOW (predict a word from its context) and skip-gram (predict the context from a word); the example below follows skip-gram with negative sampling.
Sentence: "The quick brown fox jumps over the lazy dog"
Window = 1, negative samples = 2
| Word | Context |
|---|---|
| the | quick |
| quick | the |
| quick | brown |
| brown | quick |
| brown | fox |
| [...] | [...] |
| Word | False Context |
|---|---|
| the | random_word1 |
| the | random_word2 |
| quick | random_word3 |
| quick | random_word4 |
| brown | random_word5 |
| [...] | [...] |
Positive Dataset D (label 1)
Negative Dataset D' (label 0)
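A small sketch of how such a dataset could be generated; drawing the negatives uniformly from the vocabulary is a simplification here (word2vec actually samples them from a smoothed unigram distribution):
import random
def training_pairs(tokens, window=1, n_negative=2):
    vocab = list(set(tokens))
    positives, negatives = [], []
    for i, word in enumerate(tokens):
        for j in range(max(0, i - window), min(len(tokens), i + window + 1)):
            if j == i:
                continue
            positives.append((word, tokens[j], 1))                 # real (word, context) pair -> label 1
            for _ in range(n_negative):
                negatives.append((word, random.choice(vocab), 0))  # random false context -> label 0
    return positives, negatives
# training_pairs("The quick brown fox jumps over the lazy dog".lower().split())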
Considering:
Corpus of words w ∈ W and their context c ∈ C
Parameters θ controlling the distribution P(D = 1|w, c; θ)
Vectorial representation of w and c: v_w ∈ R^d and v_c ∈ R^d (these vectors are the parameters θ)
Probability that a couple (w, c) belongs to D: P(D = 1|w, c; θ) = σ(v_w · v_c) = 1 / (1 + e^(-v_w · v_c))
Objective: argmax_θ  Σ_{(w,c) ∈ D} log σ(v_w · v_c)  +  Σ_{(w,c) ∈ D'} log σ(-v_w · v_c)
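A toy numpy illustration of this objective (the vector couples are placeholders, not a full training loop):
import numpy as np
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))
def sgns_objective(pos_pairs, neg_pairs):
    # pos_pairs / neg_pairs: lists of (v_w, v_c) vector couples taken from D and D'
    obj = 0.0
    for v_w, v_c in pos_pairs:
        obj += np.log(sigmoid(np.dot(v_w, v_c)))    # push sigma(v_w . v_c) towards 1
    for v_w, v_c in neg_pairs:
        obj += np.log(sigmoid(-np.dot(v_w, v_c)))   # push sigma(v_w . v_c) towards 0
    return obj                                       # training maximizes this w.r.t. the vectors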
Python library gensim: https://radimrehurek.com/gensim/models/word2vec.html
Main parameters: size (dimension of the vectors), window (context window), negative (number of negative samples), alpha (learning rate), min_count (ignore rarer words)
model = Word2Vec(sentences, size=100, window=5, negative=5, alpha=0.025, min_count=10)
# (in gensim >= 4.0, size was renamed to vector_size)
A lot of built-in functions:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'])
# [('queen', 0.50882536), ...]
model.wv.doesnt_match("breakfast cereal dinner lunch".split())
# 'cereal'
Usually, people just use pre-trained Word2Vec models!
Another embedding method:
Count-based models:
Learn their vectors by doing some dimensionality reduction on the co-occurrence counts matrix.
Always the same objective: minimize some "reconstruction loss" when trying to find the lower-dimensional representation that explains most of the variance in the high-dimensional data.
tl;dr: normalizing the counts & log-smoothing
Weighting the counts around the window:
Sentence: "word1 word2 word3 word4"
Window: 2
|  | word1 | word2 | word3 | word4 |
|---|---|---|---|---|
| word1 | 0 | 1 | 0.5 | 0 |
| word2 | 1 | 0 | 1 | 0.5 |
| word3 | 0.5 | 1 | 0 | 1 |
| word4 | 0 | 0.5 | 1 | 0 |
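A small sketch that builds this matrix, assuming (as the table suggests) each co-occurrence is weighted by 1/distance within the window:
import numpy as np
def cooccurrence_matrix(tokens, window=2):
    vocab = sorted(set(tokens))
    index = {w: i for i, w in enumerate(vocab)}
    X = np.zeros((len(vocab), len(vocab)))
    for i, word in enumerate(tokens):
        for d in range(1, window + 1):
            if i + d < len(tokens):
                other = tokens[i + d]
                X[index[word], index[other]] += 1.0 / d   # weight decays with distance
                X[index[other], index[word]] += 1.0 / d   # symmetric matrix
    return vocab, X
# cooccurrence_matrix("word1 word2 word3 word4".split())[1] reproduces the table above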
Based on this matrix, vectors are built so that: w_i · c_j + b_i + b'_j ≈ log(X_ij)
Where X_ij is the element (i, j) of the co-occurrence matrix, w_i / c_j are the word / context vectors and b_i / b'_j their biases
Weight function g: g(x) = (x / x_max)^α if x < x_max, else 1
Cost function: J = Σ_ij g(X_ij) (w_i · c_j + b_i + b'_j - log X_ij)²
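A numpy sketch of that cost; x_max = 100 and α = 0.75 are the values suggested in the GloVe paper, everything else is a placeholder:
import numpy as np
def g(x, x_max=100, alpha=0.75):
    return np.where(x < x_max, (x / x_max) ** alpha, 1.0)   # caps the weight of very frequent pairs
def glove_cost(W, W_context, b, b_context, X):
    # W, W_context: word / context vectors; b, b_context: biases; X: co-occurrence matrix
    eps = 1e-8                                               # avoid log(0); g(0) = 0 zeroes those terms anyway
    diff = W @ W_context.T + b[:, None] + b_context[None, :] - np.log(X + eps)
    return np.sum(g(X) * diff ** 2)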
Python library & pre-trained models:
tl;dr: Siamese network
Using pre-trained GloVe vectors (from Stanford), feed (q1, q2) to an LSTM model, concatenate the two resulting vectors into one, and feed it to some fully connected layers.
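A hedged Keras sketch of that architecture; all the sizes, and the random matrix standing in for the real GloVe weights, are made-up placeholders:
import numpy as np
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import Constant
max_len, vocab_size, embed_dim = 40, 20000, 100
glove_weights = np.random.rand(vocab_size, embed_dim)        # placeholder for the real GloVe matrix
q1_in, q2_in = Input(shape=(max_len,)), Input(shape=(max_len,))
embed = Embedding(vocab_size, embed_dim, embeddings_initializer=Constant(glove_weights), trainable=False)
shared_lstm = LSTM(64)                                       # shared weights: same encoder for both questions
v1, v2 = shared_lstm(embed(q1_in)), shared_lstm(embed(q2_in))
merged = Concatenate()([v1, v2])                             # concatenate the two sentence vectors
hidden = Dense(64, activation='relu')(merged)                # fully connected layers
output = Dense(1, activation='sigmoid')(hidden)              # probability that q1 and q2 match
model = Model(inputs=[q1_in, q2_in], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy')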
#unofficial
Looking for a data scientist intern for a full journey into the DS world: