import antigravity

Mantas Zimnickas

PyConLT'17

Slides

Jupyter notebook

n-gram

Sequence: abcde

 

n-grams:

    1-gram: a, b, c, d, e
    2-gram: ab, bc, cd, de
    3-gram: abc, bcd, cde
    4-gram: abcd, bcde
    5-gram: abcde

Python REPL

>>> s = 'abcde'
>>> n = 2

Duck typing (list)

>>> s = 'abcde'
>>> n = 2

>>> list(s)
['a', 'b', 'c', 'd', 'e']

Duck typing (tuple)

>>> s = 'abcde'
>>> n = 2

>>> list(s)
['a', 'b', 'c', 'd', 'e']

>>> tuple(s)
('a', 'b', 'c', 'd', 'e')

Duck typing (iterator)

>>> s = 'abcde'
>>> n = 2

>>> list(s)
['a', 'b', 'c', 'd', 'e']

>>> tuple(s)
('a', 'b', 'c', 'd', 'e')

>>> iter(s)
<str_iterator at 0x7f3f59c0f908>

Slicing

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')
[i:j)

Range

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')

>>> list(range(len(s)))
[0, 1, 2, 3, 4]
[i, j)

List comprehension

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')

>>> [(i, i + n) for i in range(len(s))]
[(0, 2), (1, 3), (2, 4), (3, 5), (4, 6)]
            

List comprehension

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')

>>> [(i, i + n) for i in range(len(s) - n)]
[(0, 2), (1, 3), (2, 4)]

List comprehension

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')

>>> [(i, i + n) for i in range(len(s) - n + 1)]
[(0, 2), (1, 3), (2, 4), (3, 5)]

It works!

>>> s = 'abcde'
>>> n = 2

>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')

>>> [s[i:i + n] for i in range(len(s) - n + 1)]
['ab', 'bc', 'cd', 'de']
            

Enter the zip!

>>> s = 'abcde'
>>> n = 2

>>> zip(s, s[1:])
<zip at 0x7f18a48a5f48>

Generators

>>> s = 'abcde'
>>> n = 2

>>> g = zip(s, s[1:])

Generators

>>> s = 'abcde'
>>> n = 2

>>> g = zip(s, s[1:])

>>> next(g)
('a', 'b')

Generators

>>> s = 'abcde'
>>> n = 2

>>> g = zip(s, s[1:])

>>> next(g)
('a', 'b')

>>> next(g)
('b', 'c')

Generators

>>> s = 'abcde'
>>> n = 2

>>> g = zip(s, s[1:])

>>> next(g)
('a', 'b')

>>> next(g)
('b', 'c')

>>> list(g)
[('c', 'd'), ('d', 'e')]

Bigram using zip

>>> s = 'abcde'
>>> n = 2

>>> list(zip(s, s[1:]))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

n-gram using zip?

>>> s = 'abcde'
>>> n = 2

>>> list(zip(s, s[1:]))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

>>> list(zip(s, s[1:], s[2:]))
[('a', 'b', 'c'), ('b', 'c', 'd'), ('c', 'd', 'e')]

Argument unpacking

>>> s = 'abcde'
>>> n = 2

>>> print('a', 'b', 'c', 'd', 'e')
a b c d e

>>> print(*s)
a b c d e

>>> print(*range(len(s)))
0 1 2 3 4

n-gram using zip!

>>> s = 'abcde'
>>> n = 2

>>> list(zip(*[s[i:] for i in range(n)]))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

map

>>> s = 'abcde'
>>> n = 2

>>> list(map(''.join, zip(*[s[i:] for i in range(n)])))
['ab', 'bc', 'cd', 'de']

>>> list(map(str.upper, s))
['A', 'B', 'C', 'D', 'E']

>>> ''.join(map(str.upper, s))
'ABCDE'

Why not [].join?

>>> s = 'abcde'
>>> n = 2

>>> s.join(', ')
',abcde '

Why not built-in join?

>>> s = 'abcde'
>>> n = 2

>>> s.join(', ')
',abcde '

>>> str.join(', ', s)
'a, b, c, d, e'

Why ''.join?

>>> s = 'abcde'
>>> n = 2

>>> s.join(', ')
',abcde '

>>> str.join(', ', s)
'a, b, c, d, e'

>>> ', '.join(s)
'a, b, c, d, e'

Why ''.join?

>>> s = 'abcde'
>>> n = 2

>>> s.join(', ')
',abcde '

>>> str.join(', ', s)
'a, b, c, d, e'

>>> ', '.join(s)
'a, b, c, d, e'

>>> list(map(''.join, zip(s, s[1:])))
['ab', 'bc', 'cd', 'de']

Memory usage

>>> s = 'abcde'
>>> n = 2

>>> list(zip(*[s[i:] for i in range(n)]))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

Lazy evaluation

>>> s = 'abcde'
>>> n = 2

>>> g = enumerate(tee(s, n))
>>> g = (islice(x, i, None) for i, x in g)
>>> list(zip(*g))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

islice

>>> s = 'abcde'
>>> n = 2

>>> g = enumerate(tee(s, n))
>>> g = (islice(x, i, None) for i, x in g)
>>> list(zip(*g))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

>>> list(islice(s, 1, None)) == list(s[1:None])
True

tee

>>> s = 'abcde'
>>> n = 2

>>> g = enumerate(tee(s, n))
>>> g = (islice(x, i, None) for i, x in g)
>>> list(zip(*g))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]

>>> list(map(next, tee(iter(s), 3)))
['a', 'a', 'a']

deque

>>> s = 'abcde'
>>> n = 2

>>> q = deque(maxlen=n)
>>> g = (q for x in map(q.append, s) if len(q) == n)
>>> list(map(''.join, g))
['ab', 'bc', 'cd', 'de']

Let's analyze Wikipedia!

def wikipedia(url):
    """Stream n-grams from the ``text`` elements of a bz2-compressed XML dump.

    The response is decompressed and parsed incrementally while it is being
    read, and every ``text`` element is handed to ``ngram``.

    Args:
        url: Address of the bz2-compressed XML dump to open.

    Yields:
        Whatever ``ngram`` produces for the text of each ``text`` element.
    """
    # Both the HTTP response and the decompressor are context-managed so that
    # each is closed, also when the consumer abandons the generator early.
    with urlopen(url) as response, bz2.open(response, 'rb') as xml:
        for event, elem in etree.iterparse(xml):
            # An element without content has ``text`` set to None; skip it
            # instead of passing None on to ngram().
            if elem.tag == 'text' and elem.text is not None:
                yield from ngram(elem.text)
            # Release the element's children and text once it is processed.
            elem.clear()

>>> url = ('https://dumps.wikimedia.org/ltwiki/latest'
           '/ltwiki-latest-pages-articles.xml.bz2')
>>> list(islice(wikipedia(url), 5))
['{{', '{p', 'po', 'or', 'rt']

We need a progress bar

def wikipedia(url):
    """Stream n-grams from a bz2-compressed XML dump, counting parsed elements.

    Same pipeline as before, but the parser's event stream is wrapped in
    ``tqdm`` so that one progress tick is reported per parsed element.

    Args:
        url: Address of the bz2-compressed XML dump to open.

    Yields:
        Whatever ``ngram`` produces for the text of each ``text`` element.
    """
    # Both the HTTP response and the decompressor are context-managed so that
    # each is closed, also when the consumer abandons the generator early.
    with urlopen(url) as response, bz2.open(response, 'rb') as xml:
        for event, elem in tqdm(etree.iterparse(xml)):
            # An element without content has ``text`` set to None; skip it
            # instead of passing None on to ngram().
            if elem.tag == 'text' and elem.text is not None:
                yield from ngram(elem.text)
            # Release the element's children and text once it is processed.
            elem.clear()

>>> list(islice(wikipedia(url), 5))
1it [00:00,  2.02it/s]
['{{', '{p', 'po', 'or', 'rt']

Object proxy

class TqdmProxy(wrapt.ObjectProxy):
    """File-object proxy that reports every ``read()`` to a progress bar.

    All other attribute access is forwarded to the wrapped object by
    ``wrapt.ObjectProxy``.
    """

    def __init__(self, wrapped, p):
        """Wrap ``wrapped`` and remember the progress bar ``p``.

        Args:
            wrapped: Object whose ``read`` calls should be tracked.
            p: Progress bar exposing ``update(n)``.
        """
        super().__init__(wrapped)
        # The ``_self_`` prefix is wrapt's convention for attributes that
        # live on the proxy itself instead of on the wrapped object.
        self._self_p = p

    def read(self, amt=None):
        """Read from the wrapped object and advance the bar by the bytes read.

        Args:
            amt: Passed through unchanged to the wrapped ``read``.

        Returns:
            The data returned by the wrapped ``read``.
        """
        data = self.__wrapped__.read(amt)
        # Count what was actually returned rather than what was requested:
        # ``amt`` may be None, and the last read is usually shorter than asked.
        self._self_p.update(len(data))
        return data

We have a progress bar!

def wikipedia(url):
    """Stream n-grams from a bz2-compressed XML dump with a byte progress bar.

    The HTTP response is wrapped in ``TqdmProxy`` so that every read made by
    the decompressor advances a ``tqdm`` bar measured in bytes.

    Args:
        url: Address of the bz2-compressed XML dump to open.

    Yields:
        Whatever ``ngram`` produces for the text of each ``text`` element.
    """
    with urlopen(url) as response:
        # Without a Content-Length header the size is unknown; tqdm accepts
        # total=None and then shows a counter without a percentage.
        length = response.headers.get('Content-Length')
        total = int(length) if length is not None else None
        with tqdm(unit='B', unit_scale=True, total=total) as p:
            # Context-manage the decompressor too so it is closed, also when
            # the consumer abandons the generator early.
            with bz2.open(TqdmProxy(response, p), 'rb') as xml:
                for event, elem in etree.iterparse(xml):
                    # An element without content has ``text`` set to None;
                    # skip it instead of passing None on to ngram().
                    if elem.tag == 'text' and elem.text is not None:
                        yield from ngram(elem.text)
                    # Release the element's children and text once processed.
                    elem.clear()

>>> list(islice(wikipedia(url), 5))
0%|                  | 238K/150M [00:00<05:15, 474KB/s]
['{{', '{p', 'po', 'or', 'rt']
            

Markov Chain

# Turn every (item, frequency) pair of the tally into a flat
# (first, second, frequency) triple. Each counted item is unpacked into
# exactly two parts. Sorting places triples with the same first part next
# to each other, which groupby() below depends on.
counts = sorted([
    (first, second, freq)
    for (first, second), freq in Counter(wikipedia(url)).items()
])

# Per first part, transpose its triples into columns and drop the first
# column (the key itself), leaving (second parts, frequencies).
groups = {
    first: tuple(zip(*triples))[1:]
    for first, triples in groupby(counts, key=itemgetter(0))
}

Machine learning

# One model record per key:
#   total   - sum of the frequencies of everything that follows the key
#   cumsum  - running totals of those frequencies, as a list
#   letters - maps each running total to the letter that reaches it
Entry = namedtuple('Entry', ['total', 'cumsum', 'letters'])

model = {
    first: Entry(
        sum(freqs),
        list(accumulate(freqs)),
        dict(zip(accumulate(freqs), followers)),
    )
    for first, (followers, freqs) in groups.items()
}

Lithuanian language generator

def speak(model, c='A'):
    """Yield an endless stream of letters by walking ``model`` at random.

    Args:
        model: Mapping from a letter to an entry exposing ``total``,
            ``cumsum`` and ``letters``.
        c: Letter to start from; it is yielded first.

    Yields:
        The current letter, then each randomly drawn successor in turn.
    """
    while True:
        yield c
        entry = model[c]
        # Draw a point between 0 and the entry's total, then take the first
        # cumulative value that reaches it; that value keys the next letter.
        threshold = uniform(0, entry.total)
        bound = next(b for b in entry.cumsum if threshold <= b)
        c = entry.letters[bound]

Lithuanian language generator

def speak(model, c='A'):
    """Yield an endless stream of letters by walking ``model`` at random.

    Args:
        model: Mapping from a letter to an entry exposing ``total``,
            ``cumsum`` and ``letters``.
        c: Letter to start from; it is yielded first.

    Yields:
        The current letter, then each randomly drawn successor in turn.
    """
    while True:
        yield c
        entry = model[c]
        # Draw a point between 0 and the entry's total, then take the first
        # cumulative value that reaches it; that value keys the next letter.
        threshold = uniform(0, entry.total)
        bound = next(b for b in entry.cumsum if threshold <= b)
        c = entry.letters[bound]

Lithuanian language generator

def speak(model, c='A'):
    """Yield an endless stream of letters by walking ``model`` at random.

    Args:
        model: Mapping from a letter to an entry exposing ``total``,
            ``cumsum`` and ``letters``.
        c: Letter to start from; it is yielded first.

    Yields:
        The current letter, then each randomly drawn successor in turn.
    """
    while True:
        yield c
        entry = model[c]
        # Draw a point between 0 and the entry's total, then take the first
        # cumulative value that reaches it; that value keys the next letter.
        threshold = uniform(0, entry.total)
        bound = next(b for b in entry.cumsum if threshold <= b)
        c = entry.letters[bound]

Lithuanian language generator

def speak(model, c='A'):
    """Yield an endless stream of letters by walking ``model`` at random.

    Args:
        model: Mapping from a letter to an entry exposing ``total``,
            ``cumsum`` and ``letters``.
        c: Letter to start from; it is yielded first.

    Yields:
        The current letter, then each randomly drawn successor in turn.
    """
    while True:
        yield c
        entry = model[c]
        # Draw a point between 0 and the entry's total, then take the first
        # cumulative value that reaches it; that value keys the next letter.
        threshold = uniform(0, entry.total)
        bound = next(b for b in entry.cumsum if threshold <= b)
        c = entry.letters[bound]

It can speak!

>>> print(*islice(speak(model), 512), sep='')
Alykymijo la tren Kava. Odilybūgien ham. Hizdiu, us giepuoly lainijotas U Vila kal Vių Brm f tas Ron rorėjo mišinyrto lijalatubo detylų suo turdasro FFCys O kan airentaire gytašos tenud įgisa Bopaf. SBo spas vate minijolios Mi ugria kių Junainisefrari!! kira Bių Vatas kdis fom.ch disije ceiniestous Sihtalą alos l varbrditiks. No Pa vų Atrakių ioblų marikusoldyd gaginus dia ga II aspym S. jainojį gitesejinripa, SubioVaiestvitantim. Bljos tence. Kastų Šarorim tulinanyvs taros gememiko i Radaigosmes Matair.
>>> print(*islice(speak(model), 30), sep='')

Thank you for your attention.

pycon17

By Mantas Zimnickas