Mantas Zimnickas

PyConLT'17

# n-gram

Sequence: abcde

n-grams:

1-gram: a, b, c, d, e
2-gram: ab, bc, cd, de
3-gram: abc, bcd, cde
4-gram: abcd, bcde
5-gram: abcde

# Python REPL

>>> s = 'abcde'
>>> n = 2

# Duck typing (list)

>>> s = 'abcde'
>>> n = 2

>>> list(s)
['a', 'b', 'c', 'd', 'e']

# Duck typing (tuple)

>>> s = 'abcde'
>>> n = 2

>>> list(s)
['a', 'b', 'c', 'd', 'e']

>>> tuple(s)
('a', 'b', 'c', 'd', 'e')

# Duck typing (iterator)

>>> s = 'abcde'
>>> n = 2

>>> list(s)
['a', 'b', 'c', 'd', 'e']

>>> tuple(s)
('a', 'b', 'c', 'd', 'e')

>>> iter(s)
<str_iterator at 0x7f3f59c0f908>

# Slicing

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')
Half-open interval: $[i:j)$

# Range

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')

>>> list(range(len(s)))
[0, 1, 2, 3, 4]
Half-open interval: $[i, j)$

# List comprehension

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')

>>> [(i, i + n) for i in range(len(s))]
[(0, 2), (1, 3), (2, 4), (3, 5), (4, 6)]


# List comprehension

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')

>>> [(i, i + n) for i in range(len(s) - n)]
[(0, 2), (1, 3), (2, 4)]

# List comprehension

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')

>>> [(i, i + n) for i in range(len(s) - n + 1)]
[(0, 2), (1, 3), (2, 4), (3, 5)]

# It works!

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')

>>> [s[i:i + n] for i in range(len(s) - n + 1)]
['ab', 'bc', 'cd', 'de']


# Enter the zip!

>>> s = 'abcde'
>>> n = 2

>>> zip(s, s[1:])
<zip at 0x7f18a48a5f48>

# Generators

>>> s = 'abcde'
>>> n = 2

>>> g = zip(s, s[1:])

# Generators

>>> s = 'abcde'
>>> n = 2

>>> g = zip(s, s[1:])

>>> next(g)
('a', 'b')

# Generators

>>> s = 'abcde'
>>> n = 2

>>> g = zip(s, s[1:])

>>> next(g)
('a', 'b')

>>> next(g)
('b', 'c')

# Generators

>>> s = 'abcde'
>>> n = 2

>>> g = zip(s, s[1:])

>>> next(g)
('a', 'b')

>>> next(g)
('b', 'c')

>>> list(g)
[('c', 'd'), ('d', 'e')]

# Bigram using zip

>>> s = 'abcde'
>>> n = 2

>>> list(zip(s, s[1:]))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

# n-gram using zip?

>>> s = 'abcde'
>>> n = 2

>>> list(zip(s, s[1:]))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

>>> list(zip(s, s[1:], s[2:]))
[('a', 'b', 'c'), ('b', 'c', 'd'), ('c', 'd', 'e')]

# Argument unpacking

>>> s = 'abcde'
>>> n = 2

>>> print('a', 'b', 'c', 'd', 'e')
a b c d e

>>> print(*s)
a b c d e

>>> print(*range(len(s)))
0 1 2 3 4

# n-gram using zip!

>>> s = 'abcde'
>>> n = 2

>>> list(zip(*[s[i:] for i in range(n)]))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

# map

>>> s = 'abcde'
>>> n = 2

>>> list(map(''.join, zip(*[s[i:] for i in range(n)])))
['ab', 'bc', 'cd', 'de']

>>> list(map(str.upper, s))
['A', 'B', 'C', 'D', 'E']

>>> ''.join(map(str.upper, s))
'ABCDE'

# Why not [].join?

>>> s = 'abcde'
>>> n = 2

>>> s.join(', ')
',abcde '

# Why not built-in join?

>>> s = 'abcde'
>>> n = 2

>>> s.join(', ')
',abcde '

>>> str.join(', ', s)
'a, b, c, d, e'

# Why ''.join?

>>> s = 'abcde'
>>> n = 2

>>> s.join(', ')
',abcde '

>>> str.join(', ', s)
'a, b, c, d, e'

>>> ', '.join(s)
'a, b, c, d, e'

# Why ''.join?

>>> s = 'abcde'
>>> n = 2

>>> s.join(', ')
',abcde '

>>> str.join(', ', s)
'a, b, c, d, e'

>>> ', '.join(s)
'a, b, c, d, e'

>>> list(map(''.join, zip(s, s[1:])))
['ab', 'bc', 'cd', 'de']

# Memory usage

>>> s = 'abcde'
>>> n = 2

>>> list(zip(*[s[i:] for i in range(n)]))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

# Lazy evaluation

>>> s = 'abcde'
>>> n = 2

>>> g = enumerate(tee(s, n))
>>> g = (islice(x, i, None) for i, x in g)
>>> list(zip(*g))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

# islice

>>> s = 'abcde'
>>> n = 2

>>> g = enumerate(tee(s, n))
>>> g = (islice(x, i, None) for i, x in g)
>>> list(zip(*g))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

>>> list(islice(s, 1, None)) == list(s[1:None])
True

# tee

>>> s = 'abcde'
>>> n = 2

>>> g = enumerate(tee(s, n))
>>> g = (islice(x, i, None) for i, x in g)
>>> list(zip(*g))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

>>> list(map(next, tee(iter(s), 3)))
['a', 'a', 'a']

# deque

>>> s = 'abcde'
>>> n = 2

>>> q = deque(maxlen=n)
>>> g = (q for x in map(q.append, s) if len(q) == n)
>>> list(map(''.join, g))
['ab', 'bc', 'cd', 'de']

# Let's analyze Wikipedia!

def wikipedia(url):
    """Stream n-grams from the ``text`` elements of a bz2-compressed XML dump.

    The response from ``urlopen(url)`` is decompressed and parsed
    incrementally with ``etree.iterparse``.

    Args:
        url: Address of the ``.xml.bz2`` dump to download.

    Yields:
        Every item produced by ``ngram`` for the text of each ``text`` element.
    """
    with urlopen(url) as response:
        # Close the decompressor too, not only the HTTP response.
        with bz2.open(response, 'rb') as xml:
            for _, elem in etree.iterparse(xml):
                # ``text`` may be None for an empty element; skip those
                # instead of handing None to ngram().
                if elem.tag == 'text' and elem.text:
                    yield from ngram(elem.text)
                # Release the element's content once it has been processed.
                elem.clear()

>>> url = ('https://dumps.wikimedia.org/ltwiki/latest'
...        '/ltwiki-latest-pages-articles.xml.bz2')
>>> list(islice(wikipedia(url), 5))
['{{', '{p', 'po', 'or', 'rt']

# We need a progress bar

def wikipedia(url):
    """Stream n-grams from a bz2-compressed XML dump, with a progress bar.

    Same pipeline as a plain download-decompress-parse loop, except that the
    parse events are wrapped in ``tqdm`` so progress is counted per parsed
    element.

    Args:
        url: Address of the ``.xml.bz2`` dump to download.

    Yields:
        Every item produced by ``ngram`` for the text of each ``text`` element.
    """
    with urlopen(url) as response:
        # Close the decompressor too, not only the HTTP response.
        with bz2.open(response, 'rb') as xml:
            for _, elem in tqdm(etree.iterparse(xml)):
                # ``text`` may be None for an empty element; skip those
                # instead of handing None to ngram().
                if elem.tag == 'text' and elem.text:
                    yield from ngram(elem.text)
                # Release the element's content once it has been processed.
                elem.clear()

>>> list(islice(wikipedia(url), 5))
1it [00:00,  2.02it/s]
['{{', '{p', 'po', 'or', 'rt']

# Object proxy

class TqdmProxy(wrapt.ObjectProxy):
    """File-object proxy that reports every ``read`` to a progress bar.

    Attribute access other than ``read`` is forwarded to the wrapped object
    by ``wrapt.ObjectProxy``.
    """

    def __init__(self, wrapped, p):
        """Wrap ``wrapped`` and remember the progress bar ``p``.

        Args:
            wrapped: Object whose ``read`` calls should be tracked.
            p: Progress bar exposing ``update(n)``.
        """
        super().__init__(wrapped)
        # The ``_self_`` prefix makes wrapt keep the attribute on the proxy
        # itself instead of forwarding it to the wrapped object.
        self._self_p = p

    def read(self, amt=None):
        """Read from the wrapped object and advance the bar.

        The bar moves by the number of bytes actually returned, so a short
        final read (or ``amt=None``) is counted correctly.
        """
        data = self.__wrapped__.read(amt)
        self._self_p.update(len(data))
        return data

# We have a progress bar!

def wikipedia(url):
    """Stream n-grams from a bz2-compressed XML dump, showing download progress.

    The HTTP response is wrapped in ``TqdmProxy`` so that every ``read``
    issued by the decompressor advances a byte-based ``tqdm`` bar.

    Args:
        url: Address of the ``.xml.bz2`` dump to download.

    Yields:
        Every item produced by ``ngram`` for the text of each ``text`` element.
    """
    with urlopen(url) as response:
        # NOTE(review): `length` is not defined in this snippet; presumably
        # the download size in bytes (e.g. Content-Length). Confirm.
        with tqdm(unit='B', unit_scale=True, total=length) as p:
            # Close the decompressor too, not only the HTTP response.
            with bz2.open(TqdmProxy(response, p), 'rb') as xml:
                for _, elem in etree.iterparse(xml):
                    # ``text`` may be None for an empty element; skip those
                    # instead of handing None to ngram().
                    if elem.tag == 'text' and elem.text:
                        yield from ngram(elem.text)
                    # Release the element's content once processed.
                    elem.clear()

>>> list(islice(wikipedia(url), 5))
0%|                  | 238K/150M [00:00<05:15, 474KB/s]
['{{', '{p', 'po', 'or', 'rt']


# Markov Chain

# Flatten the bigram frequencies into (first, second, count) rows. Sorting
# puts rows that share a first character next to each other, which groupby
# below relies on.
counts = sorted(
    (first, second, total)
    for (first, second), total in Counter(wikipedia(url)).items()
)

# For every first character, transpose its rows into columns and drop the
# leading column (the repeated first character), leaving
# (followers, follower_counts).
groups = {
    first: tuple(zip(*rows))[1:]
    for first, rows in groupby(counts, key=itemgetter(0))
}

# Machine learning

# Per-character transition table:
#   total   - sum of all follower counts
#   cumsum  - running totals of the follower counts
#   letters - running total -> follower character
Entry = namedtuple('Entry', ['total', 'cumsum', 'letters'])

model = {
    first: Entry(
        total=sum(weights),
        cumsum=list(accumulate(weights)),
        letters=dict(zip(accumulate(weights), followers)),
    )
    for first, (followers, weights) in groups.items()
}

# Lithuanian language generator

def speak(model, c='A'):
    """Generate an endless stream of characters from a bigram Markov model.

    Args:
        model: Mapping from a character to an entry with ``total`` (sum of
            the follower counts), ``cumsum`` (ascending running totals of
            those counts) and ``letters`` (running total -> follower).
        c: Character to start from; it is the first value yielded.

    Yields:
        One character at a time; each next character is drawn with
        probability proportional to its count after the current one.

    Raises:
        KeyError: If the current character has no entry in ``model``.
    """
    from bisect import bisect_left

    while True:
        yield c
        entry = model[c]
        r = uniform(0, entry.total)
        # First running total that is >= r; a binary search is enough
        # because cumsum is ascending.
        i = entry.cumsum[bisect_left(entry.cumsum, r)]
        c = entry.letters[i]

# Lithuanian language generator

def speak(model, c='A'):
    """Generate an endless stream of characters from a bigram Markov model.

    Args:
        model: Mapping from a character to an entry with ``total`` (sum of
            the follower counts), ``cumsum`` (ascending running totals of
            those counts) and ``letters`` (running total -> follower).
        c: Character to start from; it is the first value yielded.

    Yields:
        One character at a time; each next character is drawn with
        probability proportional to its count after the current one.

    Raises:
        KeyError: If the current character has no entry in ``model``.
    """
    from bisect import bisect_left

    while True:
        yield c
        entry = model[c]
        r = uniform(0, entry.total)
        # First running total that is >= r; a binary search is enough
        # because cumsum is ascending.
        i = entry.cumsum[bisect_left(entry.cumsum, r)]
        c = entry.letters[i]

# Lithuanian language generator

def speak(model, c='A'):
    """Generate an endless stream of characters from a bigram Markov model.

    Args:
        model: Mapping from a character to an entry with ``total`` (sum of
            the follower counts), ``cumsum`` (ascending running totals of
            those counts) and ``letters`` (running total -> follower).
        c: Character to start from; it is the first value yielded.

    Yields:
        One character at a time; each next character is drawn with
        probability proportional to its count after the current one.

    Raises:
        KeyError: If the current character has no entry in ``model``.
    """
    from bisect import bisect_left

    while True:
        yield c
        entry = model[c]
        r = uniform(0, entry.total)
        # First running total that is >= r; a binary search is enough
        # because cumsum is ascending.
        i = entry.cumsum[bisect_left(entry.cumsum, r)]
        c = entry.letters[i]

# Lithuanian language generator

def speak(model, c='A'):
    """Generate an endless stream of characters from a bigram Markov model.

    Args:
        model: Mapping from a character to an entry with ``total`` (sum of
            the follower counts), ``cumsum`` (ascending running totals of
            those counts) and ``letters`` (running total -> follower).
        c: Character to start from; it is the first value yielded.

    Yields:
        One character at a time; each next character is drawn with
        probability proportional to its count after the current one.

    Raises:
        KeyError: If the current character has no entry in ``model``.
    """
    from bisect import bisect_left

    while True:
        yield c
        entry = model[c]
        r = uniform(0, entry.total)
        # First running total that is >= r; a binary search is enough
        # because cumsum is ascending.
        i = entry.cumsum[bisect_left(entry.cumsum, r)]
        c = entry.letters[i]

# It can speak!

>>> print(*islice(speak(model), 512), sep='')
Alykymijo la tren Kava. Odilybūgien ham. Hizdiu, us giepuoly lainijotas U Vila kal Vių Brm f tas Ron rorėjo mišinyrto lijalatubo detylų suo turdasro FFCys O kan airentaire gytašos tenud įgisa Bopaf. SBo spas vate minijolios Mi ugria kių Junainisefrari!! kira Bių Vatas kdis fom.ch disije ceiniestous Sihtalą alos l varbrditiks. No Pa vų Atrakių ioblų marikusoldyd gaginus dia ga II aspym S. jainojį gitesejinripa, SubioVaiestvitantim. Bljos tence. Kastų Šarorim tulinanyvs taros gememiko i Radaigosmes Matair.
>>> print(*islice(speak(model), 30), sep='')

# Thank you for your attention.

#### pycon17

By Mantas Zimnickas

• 1,621