import antigravity

Mantas Zimnickas

PyConLT'17

https://xkcd.com/353/

Slides

https://slides.com/sirex/pycon17

Jupyter notebook

http://nbviewer.jupyter.org/gist/sirex/14cbcea85a3629bd6cf1b2955f8d2e41

n-gram

Sequence: abcde

n-grams:

1-gram: a, b, c, d, e
2-gram: ab, bc, cd, de
3-gram: abc, bcd, cde
4-gram: abcd, bcde
5-gram: abcde

Python REPL

>>> s = 'abcde'
>>> n = 2

Duck typing (list)

>>> s = 'abcde'
>>> n = 2

>>> list(s)
['a', 'b', 'c', 'd', 'e']

Duck typing (tuple)

>>> s = 'abcde'
>>> n = 2

>>> list(s)
['a', 'b', 'c', 'd', 'e']

>>> tuple(s)
('a', 'b', 'c', 'd', 'e')

Duck typing (iterator)

>>> s = 'abcde'
>>> n = 2

>>> list(s)
['a', 'b', 'c', 'd', 'e']

>>> tuple(s)
('a', 'b', 'c', 'd', 'e')

>>> iter(s)
<str_iterator at 0x7f3f59c0f908>

Slicing

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')

[i:j)

[i:j)

Range

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')

>>> list(range(len(s)))
[0, 1, 2, 3, 4]

[i, j)

[i, j)

List comprehension

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')

>>> [(i, i + n) for i in range(len(s))]
[(0, 2), (1, 3), (2, 4), (3, 5), (4, 6)]

List comprehension

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')

>>> [(i, i + n) for i in range(len(s) - n)]
[(0, 2), (1, 3), (2, 4)]

List comprehension

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')

>>> [(i, i + n) for i in range(len(s) - n + 1)]
[(0, 2), (1, 3), (2, 4), (3, 5)]

It works!

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')

>>> [s[i:i + n] for i in range(len(s) - n + 1)]
['ab', 'bc', 'cd', 'de']

Enter the zip!

>>> s = 'abcde'
>>> n = 2

>>> zip(s, s[1:])
<zip at 0x7f18a48a5f48>

Generators

>>> s = 'abcde'
>>> n = 2

>>> g = zip(s, s[1:])

Generators

>>> s = 'abcde'
>>> n = 2

>>> g = zip(s, s[1:])

>>> next(g)
('a', 'b')

Generators

>>> s = 'abcde'
>>> n = 2

>>> g = zip(s, s[1:])

>>> next(g)
('a', 'b')

>>> next(g)
('b', 'c')

Generators

>>> s = 'abcde'
>>> n = 2

>>> g = zip(s, s[1:])

>>> next(g)
('a', 'b')

>>> next(g)
('b', 'c')

>>> list(g)
[('c', 'd'), ('d', 'e')]

Bigram using zip

>>> s = 'abcde'
>>> n = 2

>>> list(zip(s, s[1:]))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

n-gram using zip?

>>> s = 'abcde'
>>> n = 2

>>> list(zip(s, s[1:]))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

>>> list(zip(s, s[1:], s[2:]))
[('a', 'b', 'c'), ('b', 'c', 'd'), ('c', 'd', 'e')]

Argument unpacking

>>> s = 'abcde'
>>> n = 2

>>> print('a', 'b', 'c', 'd', 'e')
a b c d e

>>> print(*s)
a b c d e

>>> print(*range(len(s)))
0 1 2 3 4

n-gram using zip!

>>> s = 'abcde'
>>> n = 2

>>> list(zip(*[s[i:] for i in range(n)]))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

map

>>> s = 'abcde'
>>> n = 2

>>> list(map(''.join, zip(*[s[i:] for i in range(n)])))
['ab', 'bc', 'cd', 'de']

>>> list(map(str.upper, s))
['A', 'B', 'C', 'D', 'E']

>>> ''.join(map(str.upper, s))
'ABCDE'

Why not [].join?

>>> s = 'abcde'
>>> n = 2

>>> s.join(', ')
',abcde '

Why not built-in join?

>>> s = 'abcde'
>>> n = 2

>>> s.join(', ')
',abcde '

>>> str.join(', ', s)
'a, b, c, d, e'

Why ''.join?

>>> s = 'abcde'
>>> n = 2

>>> s.join(', ')
',abcde '

>>> str.join(', ', s)
'a, b, c, d, e'

>>> ', '.join(s)
'a, b, c, d, e'

Why ''.join?

>>> s = 'abcde'
>>> n = 2

>>> s.join(', ')
',abcde '

>>> str.join(', ', s)
'a, b, c, d, e'

>>> ', '.join(s)
'a, b, c, d, e'

>>> list(map(''.join, zip(s, s[1:])))
['ab', 'bc', 'cd', 'de']

Memory usage

>>> s = 'abcde'
>>> n = 2

>>> list(zip(*[s[i:] for i in range(n)]))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

Lazy evaluation

>>> s = 'abcde'
>>> n = 2

>>> g = enumerate(tee(s, n))
>>> g = (islice(x, i, None) for i, x in g)
>>> list(zip(*g))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

islice

>>> s = 'abcde'
>>> n = 2

>>> g = enumerate(tee(s, n))
>>> g = (islice(x, i, None) for i, x in g)
>>> list(zip(*g))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

>>> list(islice(s, 1, None)) == list(s[1:None])
True

tee

>>> s = 'abcde'
>>> n = 2

>>> g = enumerate(tee(s, n))
>>> g = (islice(x, i, None) for i, x in g)
>>> list(zip(*g))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

>>> list(map(next, tee(iter(s), 3)))
['a', 'a', 'a']

deque

>>> s = 'abcde'
>>> n = 2

>>> q = deque(maxlen=n)
>>> g = (q for x in map(q.append, s) if len(q) == n)
>>> list(map(''.join, g))
['ab', 'bc', 'cd', 'de']

Let's analyze Wikipedia!

def wikipedia(url):
    """Stream n-grams out of a bz2-compressed XML dump fetched from *url*.

    The response is decompressed and parsed incrementally; for every parsed
    element whose tag is ``'text'`` the items produced by ``ngram`` for that
    element's text are yielded one by one.
    """
    with urlopen(url) as response:
        stream = bz2.open(response, 'rb')
        for _event, node in etree.iterparse(stream):
            if node.tag == 'text':
                yield from ngram(node.text)
            # Reset every element once it has been handled so that already
            # processed content is not kept around while parsing continues.
            node.clear()

>>> url = ('https://dumps.wikimedia.org/ltwiki/latest'
           '/ltwiki-latest-pages-articles.xml.bz2')
>>> list(islice(wikipedia(url), 5))
['{{', '{p', 'po', 'or', 'rt']

We need a progress bar

def wikipedia(url):
    """Stream n-grams out of a bz2-compressed XML dump fetched from *url*.

    Same pipeline as before, but the stream of parse events is wrapped in
    ``tqdm`` so progress is counted per parsed element.
    """
    with urlopen(url) as response:
        stream = bz2.open(response, 'rb')
        parse_events = etree.iterparse(stream)
        for _event, node in tqdm(parse_events):
            if node.tag == 'text':
                yield from ngram(node.text)
            # Reset every element once it has been handled so that already
            # processed content is not kept around while parsing continues.
            node.clear()

>>> list(islice(wikipedia(url), 5))
1it [00:00,  2.02it/s]
['{{', '{p', 'po', 'or', 'rt']

Object proxy

class TqdmProxy(wrapt.ObjectProxy):
    """Proxy for a file-like object that reports bytes read to a progress bar.

    Everything except ``read`` is forwarded to the wrapped object by
    ``wrapt.ObjectProxy``.

    Parameters:
        wrapped: the file-like object to read from.
        p: progress object with an ``update(n)`` method (e.g. a ``tqdm`` bar).
    """

    def __init__(self, wrapped, p):
        super().__init__(wrapped)
        # The ``_self_`` prefix makes wrapt store the attribute on the proxy
        # itself instead of forwarding it to the wrapped object.
        self._self_p = p

    def read(self, amt=None):
        """Read up to *amt* bytes from the wrapped object and record progress.

        Progress is advanced by the number of bytes actually returned, not by
        the requested size: a read may return less than *amt* (always at the
        end of the stream), and *amt* may be ``None``, which is not a usable
        increment.
        """
        data = self.__wrapped__.read(amt)
        self._self_p.update(len(data))
        return data

We have a progress bar!

def wikipedia(url):
    """Stream n-grams out of a bz2-compressed XML dump, showing download progress.

    The raw HTTP response is wrapped in ``TqdmProxy`` so that the progress bar
    advances as compressed bytes are pulled from the network. For every parsed
    element whose tag is ``'text'`` the items produced by ``ngram`` for that
    element's text are yielded one by one.
    """
    with urlopen(url) as f:
        # The Content-Length header may be missing (the lookup then gives
        # None); in that case pass total=None so tqdm shows a plain byte
        # counter instead of failing on int(None).
        length = f.headers.get('Content-Length')
        total = int(length) if length is not None else None
        with tqdm(unit='B', unit_scale=True, total=total) as p:
            stream = bz2.open(TqdmProxy(f, p), 'rb')
            for _event, elem in etree.iterparse(stream):
                if elem.tag == 'text':
                    yield from ngram(elem.text)
                # Reset every element once it has been handled so that
                # already processed content is not kept around.
                elem.clear()

>>> list(islice(wikipedia(url), 5))
0%|                  | 238K/150M [00:00<05:15, 474KB/s]
['{{', '{p', 'po', 'or', 'rt']

Markov Chain

# Count every bigram coming out of the dump and flatten the result into
# sorted (first letter, second letter, frequency) rows.
counts = sorted(
    (first, second, freq)
    for (first, second), freq in Counter(wikipedia(url)).items()
)

# Group the rows by their first letter. Transposing a group with zip(*...)
# gives (first letters, second letters, frequencies); the leading column is
# dropped, leaving a (second letters, frequencies) pair per first letter.
# groupby only merges adjacent rows, which is why `counts` is sorted.
groups = {
    first: tuple(zip(*rows))[1:]
    for first, rows in groupby(counts, key=itemgetter(0))
}

Machine learning

# Per-letter record used by the model:
#   total   - sum of the follower counts
#   cumsum  - running totals of the follower counts
#   letters - mapping from each running total to the follower letter
Entry = namedtuple('Entry', ['total', 'cumsum', 'letters'])

# For every leading letter, precompute what is needed to draw a follower:
# the overall count, the running totals, and a lookup from running total
# back to the follower letter.
model = {
    leading: Entry(
        total=sum(freqs),
        cumsum=list(accumulate(freqs)),
        letters=dict(zip(accumulate(freqs), followers)),
    )
    for leading, (followers, freqs) in groups.items()
}

Lithuanian language generator

def speak(model, c='A'):
    """Yield an endless stream of characters, beginning with *c*.

    After each character, a number is drawn uniformly between 0 and the
    entry's ``total``; the first value in ``cumsum`` that is not below the
    draw selects, via ``letters``, the character to emit next.
    """
    while True:
        yield c
        entry = model[c]
        draw = uniform(0, entry.total)
        # First running total that reaches the drawn number.
        bucket = next(bound for bound in entry.cumsum if draw <= bound)
        c = entry.letters[bucket]

Lithuanian language generator

def speak(model, c='A'):
    """Yield an endless stream of characters, beginning with *c*.

    After each character, a number is drawn uniformly between 0 and the
    entry's ``total``; the first value in ``cumsum`` that is not below the
    draw selects, via ``letters``, the character to emit next.
    """
    while True:
        yield c
        entry = model[c]
        draw = uniform(0, entry.total)
        # First running total that reaches the drawn number.
        bucket = next(bound for bound in entry.cumsum if draw <= bound)
        c = entry.letters[bucket]

Lithuanian language generator

def speak(model, c='A'):
    """Yield an endless stream of characters, beginning with *c*.

    After each character, a number is drawn uniformly between 0 and the
    entry's ``total``; the first value in ``cumsum`` that is not below the
    draw selects, via ``letters``, the character to emit next.
    """
    while True:
        yield c
        entry = model[c]
        draw = uniform(0, entry.total)
        # First running total that reaches the drawn number.
        bucket = next(bound for bound in entry.cumsum if draw <= bound)
        c = entry.letters[bucket]

Lithuanian language generator

def speak(model, c='A'):
    """Yield an endless stream of characters, beginning with *c*.

    After each character, a number is drawn uniformly between 0 and the
    entry's ``total``; the first value in ``cumsum`` that is not below the
    draw selects, via ``letters``, the character to emit next.
    """
    while True:
        yield c
        entry = model[c]
        draw = uniform(0, entry.total)
        # First running total that reaches the drawn number.
        bucket = next(bound for bound in entry.cumsum if draw <= bound)
        c = entry.letters[bucket]

It can speak!

>>> print(*islice(speak(model), 512), sep='')

Alykymijo la tren Kava. Odilybūgien ham. Hizdiu, us giepuoly lainijotas U Vila kal Vių Brm f tas Ron rorėjo mišinyrto lijalatubo detylų suo turdasro FFCys O kan airentaire gytašos tenud įgisa Bopaf. SBo spas vate minijolios Mi ugria kių Junainisefrari!! kira Bių Vatas kdis fom.ch disije ceiniestous Sihtalą alos l varbrditiks. No Pa vų Atrakių ioblų marikusoldyd gaginus dia ga II aspym S. jainojį gitesejinripa, SubioVaiestvitantim. Bljos tence. Kastų Šarorim tulinanyvs taros gememiko i Radaigosmes Matair.

>>> print(*islice(speak(model), 30), sep='')

import antigravity

Slides

Jupyter notebook

n-gram

Python REPL

Duck typing (list)

Duck typing (tuple)

Duck typing (iterator)

Slicing

Range

List comprehension

List comprehension

List comprehension

It works!

Enter the zip!

Generators

Generators

Generators

Generators

Bigram using zip

n-gram using zip?

Argument unpacking

n-gram using zip!

map

Why not [].join?

Why not built-in join?

Why ''.join?

Why ''.join?

Memory usage

Lazy evaluation

islice

tee

deque

Let's analyze Wikipedia!

We need a progress bar

Object proxy

We have a progress bar!

Markov Chain

Machine learning

Lithuanian language generator

Lithuanian language generator

Lithuanian language generator

Lithuanian language generator

It can speak!

Thank you for your attention.

pycon17