import antigravity
Mantas Zimnickas
PyConLT'17
Slides
Jupyter notebook
n-gram
Sequence: abcde
n-grams:
1-gram: a, b, c, d, e
2-gram: ab, bc, cd, de
3-gram: abc, bcd, cde
4-gram: abcd, bcde
5-gram: abcde
Python REPL
>>> s = 'abcde'
>>> n = 2
Duck typing (list)
>>> s = 'abcde'
>>> n = 2
>>> list(s)
['a', 'b', 'c', 'd', 'e']
Duck typing (tuple)
>>> s = 'abcde'
>>> n = 2
>>> list(s)
['a', 'b', 'c', 'd', 'e']
>>> tuple(s)
('a', 'b', 'c', 'd', 'e')
Duck typing (iterator)
>>> s = 'abcde'
>>> n = 2
>>> list(s)
['a', 'b', 'c', 'd', 'e']
>>> tuple(s)
('a', 'b', 'c', 'd', 'e')
>>> iter(s)
<str_iterator at 0x7f3f59c0f908>
Slicing
>>> s = 'abcde'
>>> n = 2
>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')
[i:j)
[i:j)
Range
>>> s = 'abcde'
>>> n = 2
>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')
>>> list(range(len(s)))
[0, 1, 2, 3, 4]
[i, j)
[i,j)
List comprehension
>>> s = 'abcde'
>>> n = 2
>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')
>>> [(i, i + n) for i in range(len(s))]
[(0, 2), (1, 3), (2, 4), (3, 5), (4, 6)]
List comprehension
>>> s = 'abcde'
>>> n = 2
>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')
>>> [(i, i + n) for i in range(len(s) - n)]
[(0, 2), (1, 3), (2, 4)]
List comprehension
>>> s = 'abcde'
>>> n = 2
>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')
>>> [(i, i + n) for i in range(len(s) - n + 1)]
[(0, 2), (1, 3), (2, 4), (3, 5)]
It works!
>>> s = 'abcde'
>>> n = 2
>>> s[0:2], s[1:3], s[2:4], s[3:5]
('ab', 'bc', 'cd', 'de')
>>> [s[i:i + n] for i in range(len(s) - n + 1)]
['ab', 'bc', 'cd', 'de']
Enter the zip!
>>> s = 'abcde'
>>> n = 2
>>> zip(s, s[1:])
<zip at 0x7f18a48a5f48>
Generators
>>> s = 'abcde'
>>> n = 2
>>> g = zip(s, s[1:])
Generators
>>> s = 'abcde'
>>> n = 2
>>> g = zip(s, s[1:])
>>> next(g)
('a', 'b')
Generators
>>> s = 'abcde'
>>> n = 2
>>> g = zip(s, s[1:])
>>> next(g)
('a', 'b')
>>> next(g)
('b', 'c')
Generators
>>> s = 'abcde'
>>> n = 2
>>> g = zip(s, s[1:])
>>> next(g)
('a', 'b')
>>> next(g)
('b', 'c')
>>> list(g)
[('c', 'd'), ('d', 'e')]
Bigram using zip
>>> s = 'abcde'
>>> n = 2
>>> list(zip(s, s[1:]))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]
n-gram using zip?
>>> s = 'abcde'
>>> n = 2
>>> list(zip(s, s[1:]))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]
>>> list(zip(s, s[1:], s[2:]))
[('a', 'b', 'c'), ('b', 'c', 'd'), ('c', 'd', 'e')]
Argument unpacking
>>> s = 'abcde'
>>> n = 2
>>> print('a', 'b', 'c', 'd', 'e')
a b c d e
>>> print(*s)
a b c d e
>>> print(*range(len(s)))
0 1 2 3 4
n-gram using zip!
>>> s = 'abcde'
>>> n = 2
>>> list(zip(*[s[i:] for i in range(n)]))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]
map
>>> s = 'abcde'
>>> n = 2
>>> list(map(''.join, zip(*[s[i:] for i in range(n)])))
['ab', 'bc', 'cd', 'de']
>>> list(map(str.upper, s))
['A', 'B', 'C', 'D', 'E']
>>> ''.join(map(str.upper, s))
'ABCDE'
Why not [].join?
>>> s = 'abcde'
>>> n = 2
>>> s.join(', ')
',abcde '
Why not built-in join?
>>> s = 'abcde'
>>> n = 2
>>> s.join(', ')
',abcde '
>>> str.join(', ', s)
'a, b, c, d, e'
Why ''.join?
>>> s = 'abcde'
>>> n = 2
>>> s.join(', ')
',abcde '
>>> str.join(', ', s)
'a, b, c, d, e'
>>> ', '.join(s)
'a, b, c, d, e'
Why ''.join?
>>> s = 'abcde'
>>> n = 2
>>> s.join(', ')
',abcde '
>>> str.join(', ', s)
'a, b, c, d, e'
>>> ', '.join(s)
'a, b, c, d, e'
>>> list(map(''.join, zip(s, s[1:])))
['ab', 'bc', 'cd', 'de']
Memory usage
>>> s = 'abcde'
>>> n = 2
>>> list(zip(*[s[i:] for i in range(n)]))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]
Lazy evaluation
>>> s = 'abcde'
>>> n = 2
>>> g = enumerate(tee(s, n))
>>> g = (islice(x, i, None) for i, x in g)
>>> list(zip(*g))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]
islice
>>> s = 'abcde'
>>> n = 2
>>> g = enumerate(tee(s, n))
>>> g = (islice(x, i, None) for i, x in g)
>>> list(zip(*g))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]
>>> list(islice(s, 1, None)) == list(s[1:None])
True
tee
>>> s = 'abcde'
>>> n = 2
>>> g = enumerate(tee(s, n))
>>> g = (islice(x, i, None) for i, x in g)
>>> list(zip(*g))
[('a', 'b'), ('b', 'c'), ('c', 'd'), ('d', 'e')]
>>> list(map(next, tee(iter(s), 3)))
['a', 'a', 'a']
deque
>>> s = 'abcde'
>>> n = 2
>>> q = deque(maxlen=n)
>>> g = (q for x in map(q.append, s) if len(q) == n)
>>> list(map(''.join, g))
['ab', 'bc', 'cd', 'de']
Let's analyze Wikipedia!
def wikipedia(url):
    """Yield n-grams from the bz2-compressed XML dump served at ``url``.

    The response is decompressed and parsed incrementally. For every parsed
    element whose tag is ``'text'`` the items produced by ``ngram`` for that
    element's text are yielded one by one.
    """
    with urlopen(url) as response:
        # Manage the decompressor with ``with`` so it is closed
        # deterministically instead of being left to the garbage collector.
        with bz2.open(response, 'rb') as dump:
            for event, elem in etree.iterparse(dump):
                # An element without content has ``text`` set to None;
                # there is nothing to build n-grams from in that case.
                if elem.tag == 'text' and elem.text is not None:
                    yield from ngram(elem.text)
                # Drop the element's content once it has been handled.
                elem.clear()
>>> url = ('https://dumps.wikimedia.org/ltwiki/latest'
'/ltwiki-latest-pages-articles.xml.bz2')
>>> list(islice(wikipedia(url), 5))
['{{', '{p', 'po', 'or', 'rt']
We need a progress bar
def wikipedia(url):
    """Yield n-grams from the bz2-compressed XML dump served at ``url``.

    Same streaming pipeline as before, but the parse events are wrapped in
    ``tqdm`` so the progress bar advances once per parsed element.
    """
    with urlopen(url) as response:
        # Manage the decompressor with ``with`` so it is closed
        # deterministically instead of being left to the garbage collector.
        with bz2.open(response, 'rb') as dump:
            for event, elem in tqdm(etree.iterparse(dump)):
                # An element without content has ``text`` set to None;
                # there is nothing to build n-grams from in that case.
                if elem.tag == 'text' and elem.text is not None:
                    yield from ngram(elem.text)
                # Drop the element's content once it has been handled.
                elem.clear()
>>> list(islice(wikipedia(url), 5))
1it [00:00, 2.02it/s]
['{{', '{p', 'po', 'or', 'rt']
Object proxy
class TqdmProxy(wrapt.ObjectProxy):
    """Proxy for a readable object that reports bytes read to a progress bar.

    Every attribute is delegated to ``wrapped`` except ``read``, which also
    advances the progress bar ``p`` by the number of bytes returned.
    """

    def __init__(self, wrapped, p):
        super().__init__(wrapped)
        # The ``_self_`` prefix keeps the attribute on the proxy itself
        # rather than forwarding the assignment to the wrapped object.
        self._self_p = p

    def read(self, amt=None):
        """Read from the wrapped object and record how much was returned."""
        data = self.__wrapped__.read(amt)
        # Count what was actually read: ``amt`` may be None (read all) and
        # the final read is usually shorter than the requested size.
        self._self_p.update(len(data))
        return data
We have a progress bar!
def wikipedia(url):
    """Yield n-grams from the bz2-compressed XML dump served at ``url``.

    A ``tqdm`` progress bar measured in bytes is shown while downloading:
    the response is wrapped in ``TqdmProxy`` so every ``read`` performed by
    the decompressor advances the bar.
    """
    with urlopen(url) as response:
        # The header is optional; without it the bar runs with an unknown
        # total instead of failing on ``int(None)``.
        length = response.headers.get('Content-Length')
        total = int(length) if length is not None else None
        with tqdm(unit='B', unit_scale=True, total=total) as p:
            # Manage the decompressor with ``with`` so it is closed
            # deterministically instead of being left to the garbage
            # collector.
            with bz2.open(TqdmProxy(response, p), 'rb') as dump:
                for event, elem in etree.iterparse(dump):
                    # An element without content has ``text`` set to None;
                    # there is nothing to build n-grams from in that case.
                    if elem.tag == 'text' and elem.text is not None:
                        yield from ngram(elem.text)
                    # Drop the element's content once it has been handled.
                    elem.clear()
>>> list(islice(wikipedia(url), 5))
0%| | 238K/150M [00:00<05:15, 474KB/s]
['{{', '{p', 'po', 'or', 'rt']
Markov Chain
# Flatten the bigram frequencies into (first, second, frequency) rows.
# Each key of the Counter is unpacked into exactly two items.
counts = [
    (first, second, freq)
    for (first, second), freq in Counter(wikipedia(url)).items()
]
# groupby only merges adjacent rows, so the rows must be ordered first.
counts.sort()

# For every first item keep a pair of parallel tuples:
# (items that followed it, how often each of them followed).
groups = {
    first: tuple(zip(*(row[1:] for row in rows)))
    for first, rows in groupby(counts, key=lambda row: row[0])
}
Machine learning
# Per-item record of the model:
#   total   - sum of all follower counts
#   cumsum  - running totals of the follower counts, in order
#   letters - maps each running total to the follower at the same position
Entry = namedtuple('Entry', 'total cumsum letters')

# Build one Entry for every key of ``groups``.
model = {
    first: Entry(
        sum(freqs),
        list(accumulate(freqs)),
        dict(zip(accumulate(freqs), followers)),
    )
    for first, (followers, freqs) in groups.items()
}
Lithuanian language generator
def speak(model, c='A'):
    """Generate an endless stream of items, starting with ``c``.

    After yielding the current item, a number is drawn uniformly between 0
    and the entry's ``total``; the first running total in ``cumsum`` that is
    not smaller than the draw selects the next item through ``letters``.
    """
    current = c
    while True:
        yield current
        entry = model[current]
        draw = uniform(0, entry.total)
        # First cumulative bound that reaches the drawn value.
        bound = next(limit for limit in entry.cumsum if draw <= limit)
        current = entry.letters[bound]
Lithuanian language generator
def speak(model, c='A'):
    """Generate an endless stream of items, starting with ``c``.

    After yielding the current item, a number is drawn uniformly between 0
    and the entry's ``total``; the first running total in ``cumsum`` that is
    not smaller than the draw selects the next item through ``letters``.
    """
    current = c
    while True:
        yield current
        entry = model[current]
        draw = uniform(0, entry.total)
        # First cumulative bound that reaches the drawn value.
        bound = next(limit for limit in entry.cumsum if draw <= limit)
        current = entry.letters[bound]
Lithuanian language generator
def speak(model, c='A'):
    """Generate an endless stream of items, starting with ``c``.

    After yielding the current item, a number is drawn uniformly between 0
    and the entry's ``total``; the first running total in ``cumsum`` that is
    not smaller than the draw selects the next item through ``letters``.
    """
    current = c
    while True:
        yield current
        entry = model[current]
        draw = uniform(0, entry.total)
        # First cumulative bound that reaches the drawn value.
        bound = next(limit for limit in entry.cumsum if draw <= limit)
        current = entry.letters[bound]
Lithuanian language generator
def speak(model, c='A'):
    """Generate an endless stream of items, starting with ``c``.

    After yielding the current item, a number is drawn uniformly between 0
    and the entry's ``total``; the first running total in ``cumsum`` that is
    not smaller than the draw selects the next item through ``letters``.
    """
    current = c
    while True:
        yield current
        entry = model[current]
        draw = uniform(0, entry.total)
        # First cumulative bound that reaches the drawn value.
        bound = next(limit for limit in entry.cumsum if draw <= limit)
        current = entry.letters[bound]
It can speak!
>>> print(*islice(speak(model), 512), sep='')
Alykymijo la tren Kava. Odilybūgien ham. Hizdiu, us giepuoly lainijotas U Vila kal Vių Brm f tas Ron rorėjo mišinyrto lijalatubo detylų suo turdasro FFCys O kan airentaire gytašos tenud įgisa Bopaf. SBo spas vate minijolios Mi ugria kių Junainisefrari!! kira Bių Vatas kdis fom.ch disije ceiniestous Sihtalą alos l varbrditiks. No Pa vų Atrakių ioblų marikusoldyd gaginus dia ga II aspym S. jainojį gitesejinripa, SubioVaiestvitantim. Bljos tence. Kastų Šarorim tulinanyvs taros gememiko i Radaigosmes Matair.
>>> print(*islice(speak(model), 30), sep='')
Thank you, for your attention.
pycon17
By Mantas Zimnickas
pycon17
- 1,690