class Foo
def bar
puts "Foobar"
end
end

Please make sure you're sitting where you can read this slide
that man should not be expected in a few minutes,
and i was left drifting on a large and luxurious city called Paris,
surrounded by mountains of ice, which was to be the hellish triumph;
clapping my hands, and in the morning, however, lay scattered on the ground.
DOCUMENT = "the cat sat on the mat"
DEFAULT_SEQUENCE_LENGTH = 10
tokens = DOCUMENT.split
token_counts = tokens.tally
# => { "the" => 2, "cat" => 1, "sat" => 1, "on" => 1, "mat" => 1 }
total_token_count = token_counts.values.sum
probability_distribution = token_counts.transform_values do |count|
count / total_token_count.to_f
end
# => { "the" => 0.3333333333333333, "cat" => 0.16666666666666666, ... }
def generate_next_token(probability_distribution)
probability_distribution.max_by(&:last).first
end
sequence = []
DEFAULT_SEQUENCE_LENGTH.times do
sequence.push(generate_next_token(probability_distribution))
end
puts sequence.join(" ")"the the the the the the the the the the"class LanguageModel
DOCUMENT = "the cat sat on the mat"
DEFAULT_SEQUENCE_LENGTH = 10
def initialize
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
Array.new(sequence_length) { generate_next_token }.join(" ")
end
protected
def generate_next_token
@probability_distribution.max_by(&:last).first
end
def calculate_probability_distribution
token_counts = DOCUMENT.split.tally
total_token_count = token_counts.values.sum
token_counts.transform_values do |count|
count / total_token_count.to_f
end
end
end
class LanguageModel
DOCUMENT = "the cat sat on the mat"
DEFAULT_SEQUENCE_LENGTH = 10
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = Array.new(sequence_length) { generate_next_token }
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token
@probability_distribution.max_by(&:last).first
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(DOCUMENT)
token_counts = tokens.tally
total_token_count = token_counts.values.sum
token_counts.transform_values do |count|
count / total_token_count.to_f
end
end
end
class LanguageModel
DOCUMENT = "the cat sat on the mat"
DEFAULT_SEQUENCE_LENGTH = 10
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = Array.new(sequence_length) { generate_next_token }
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token
@probability_distribution.max_by(&:last).first
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(DOCUMENT)
token_counts = NGramCounter.new(tokens: tokens).ngram_counts
total_token_count = token_counts.values.sum
token_counts.transform_values do |count|
count / total_token_count.to_f
end
end
end

class LanguageModel
DOCUMENT = "the cat sat on the mat"
DEFAULT_SEQUENCE_LENGTH = 10
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = Array.new(sequence_length) { generate_next_token }
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token
@probability_distribution.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(DOCUMENT)
token_counts = NGramCounter.new(tokens: tokens).ngram_counts
ProbabilityDistribution.new(ngram_counts: token_counts).distribution
end
end"the the the the the the the the the the"the cat sat on the mat
50%
50%
the cat sat on the mat
100%
the cat sat on the mat
"the" => { "cat" => 50%, "mat" => 50% }
"cat" => { "sat" => 100% }
class LanguageModel
DOCUMENT = "the cat sat on the mat"
DEFAULT_SEQUENCE_LENGTH = 10
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = Array.new(sequence_length) { generate_next_token }
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token
@probability_distribution.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(DOCUMENT)
token_counts = NGramCounter.new(tokens: tokens).ngram_counts
ProbabilityDistribution.new(ngram_counts: token_counts).distribution
end
end

class LanguageModel
DOCUMENT = "the cat sat on the mat"
DEFAULT_SEQUENCE_LENGTH = 10
N = 2
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = ["the"]
Array.new(sequence_length) do
next_token = generate_next_token(context: sequence.last)
sequence << next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return "" if Array(candidates).empty?
candidates.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(DOCUMENT)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end
class LanguageModel
DOCUMENT = "the cat sat on the mat"
DEFAULT_SEQUENCE_LENGTH = 10
N = 2
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = ["the"]
Array.new(sequence_length) do
next_token = generate_next_token(context: sequence.last)
sequence << next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return "" if Array(candidates).empty?
candidates.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(DOCUMENT)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end
class ProbabilityDistribution
TokenProbability = Data.define(:token, :probability)
def initialize(ngram_counts: {})
@ngram_counts = ngram_counts
end
def distribution
return @distribution if defined?(@distribution)
total = @ngram_counts.values.sum
@distribution = @ngram_counts.map do |token, count|
TokenProbability[token, count / total.to_f]
end
end
end
class LanguageModel
DOCUMENT = "the cat sat on the mat"
DEFAULT_SEQUENCE_LENGTH = 10
N = 2
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = ["the"]
Array.new(sequence_length) do
next_token = generate_next_token(context: sequence.last)
sequence << next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return "" if Array(candidates).empty?
candidates.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(DOCUMENT)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end
class ProbabilityDistribution
TokenProbability = Data.define(:token, :probability)
def initialize(ngram_counts: {})
@ngram_counts = ngram_counts
end
def [](context)
distribution.fetch(context, [])
end
def distribution
return @distribution if defined?(@distribution)
@distribution = @ngram_counts.to_h do |context, target_counts|
total = target_counts.values.sum
target_probabilities = target_counts.map do |token, count|
TokenProbability[token, count / total.to_f]
end
[context, target_probabilities]
end
end
end
class LanguageModel
DOCUMENT = "the cat sat on the mat"
DEFAULT_SEQUENCE_LENGTH = 10
N = 2
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = ["the"]
Array.new(sequence_length) do
next_token = generate_next_token(context: sequence.last)
sequence << next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return "" if Array(candidates).empty?
candidates.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(DOCUMENT)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end
# BEFORE
#
token: "the", probability: 0.3333333333333333,
token:"cat", probability: 0.16666666666666666,
token:"sat", probability: 0.16666666666666666,
...
# AFTER
#
"the"
token:"cat", probability: 0.5
token:"mat", probability: 0.5
"cat"
token:"sat", probability: 1.0
"sat"
token:"on", probability: 1.0
...class LanguageModel
DOCUMENT = "the cat sat on the mat"
DEFAULT_SEQUENCE_LENGTH = 10
N = 2
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = @tokenizer.tokenize("the")
Array.new(sequence_length) do
next_token = generate_next_token(context: sequence.last)
sequence << next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return "" if Array(candidates).empty?
candidates.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(DOCUMENT)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end
"the cat sat on the cat sat on the cat sat""the cat sat on the cat sat on the cat sat"class LanguageModel
DOCUMENT = "the cat sat on the mat"
DEFAULT_SEQUENCE_LENGTH = 10
N = 3
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = @tokenizer.tokenize("the cat")
Array.new(sequence_length) do
next_token = generate_next_token(context: sequence.last(N - 1))
sequence << next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return "" if Array(candidates).empty?
candidates.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(DOCUMENT)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end
class LanguageModel
DOCUMENT = "the cat sat on the mat"
DEFAULT_SEQUENCE_LENGTH = 10
N = 2
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = ["the"]
Array.new(sequence_length) do
next_token = generate_next_token(context: sequence.last)
sequence << next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return "" if Array(candidates).empty?
candidates.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(DOCUMENT)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end
# BEFORE
#
"the"
token:"cat", probability: 0.5
token:"mat", probability: 0.5
"cat"
token:"sat", probability: 1.0
"sat"
token:"on", probability: 1.0
...
# After
#
["the", "cat"]
token:"sat", probability: 1.0
["cat", "sat"]
token:"on", probability: 1.0
["sat", "on"]
token:"the", probability: 1.0
..."the cat sat on the mat '' '' '' ''""the cat sat on the mat '' '' '' ''"class LanguageModel
DOCUMENT = "BOS the cat sat on the mat EOS"
DEFAULT_SEQUENCE_LENGTH = 10
N = 3
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = @tokenizer.tokenize("BOS the")
until sequence.length >= sequence_length
break if sequence.last == "EOS"
next_token = generate_next_token(context: sequence.last(N - 1))
sequence << next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return "EOS" if Array(candidates).empty?
candidates.max_by(&:probability).token
end
def calculate_probability_distribution
document = DOCUMENT
tokens = @tokenizer.tokenize(document)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end"the cat sat on the mat"class LanguageModel
DEFAULT_SEQUENCE_LENGTH = 10
N = 3
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = @tokenizer.tokenize("the")[0..-2]
until sequence.length >= sequence_length
break if sequence.last == Tokenizer::EOS
next_token = generate_next_token(context: sequence.last(N - 1))
sequence << next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return Tokenizer::EOS if Array(candidates).empty?
candidates.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(*Document.new.samples)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end

class LanguageModel
DEFAULT_SEQUENCE_LENGTH = 10
N = 3
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = @tokenizer.tokenize("the")[0..-2]
until sequence.last == Tokenizer::EOS
break if sequence.length >= sequence_length
next_token = generate_next_token(context: sequence.last(N - 1))
sequence << next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return Tokenizer::EOS if Array(candidates).empty?
candidates.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(*Document.new.samples)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end

class Tokenizer
BOS = "BOS"
EOS = "EOS"
def tokenize(*samples)
samples.flat_map do |sample|
"#{bos_token} #{sample.to_s.downcase} #{eos_token}".split
end
end
def bos_token = BOS
def eos_token = EOS
def detokenize(tokens)
tokens.delete(bos_token)
tokens.delete(eos_token)
tokens.join(" ")
end
end

class LanguageModel
DEFAULT_SEQUENCE_LENGTH = 10
N = 3
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = @tokenizer.tokenize("the")[0..-2]
until sequence.last == Tokenizer::EOS
break if sequence.length >= sequence_length
next_token = generate_next_token(context: sequence.last(N - 1))
sequence << next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return Tokenizer::EOS if Array(candidates).empty?
candidates.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(*Document.new.samples)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end

class Document
attr_reader :samples
def initialize
@samples = [
"The cat sat on the mat"
]
end
end

Breaking text down into numbers to represent words and sub-word units
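A toy illustration of that mapping, using a word-level vocabulary rather than the tiktoken-based Tokenizer shown later:

# Assign each unique word an integer id.
words = "the cat sat on the mat".split
vocabulary = words.uniq.each_with_index.to_h
# => { "the" => 0, "cat" => 1, "sat" => 2, "on" => 3, "mat" => 4 }

# Tokenizing turns text into ids; detokenizing maps the ids back to text.
ids = words.map { |word| vocabulary[word] }
# => [0, 1, 2, 3, 0, 4]
vocabulary.invert.values_at(*ids).join(" ")
# => "the cat sat on the mat"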
class LanguageModel
DEFAULT_SEQUENCE_LENGTH = 10
N = 3
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = @tokenizer.tokenize("the")[0..-2]
until sequence.length >= sequence_length
break if sequence.last == @tokenizer.eos_token
next_token = generate_next_token(context: sequence.last(N - 1))
sequence.push next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return @tokenizer.eos_token if Array(candidates).empty?
candidates.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(*Document.new.samples)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end

class LanguageModel
DEFAULT_SEQUENCE_LENGTH = 10
N = 3
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = @tokenizer.tokenize("the")[0..-2]
until sequence.length >= sequence_length
break if sequence.last == @tokenizer.eos_token
next_token = generate_next_token(context: sequence.last(N - 1))
sequence.push next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return @tokenizer.eos_token if Array(candidates).empty?
candidates.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(*Document.new.samples)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end

class Tokenizer
require "pycall/import"
include PyCall::Import
BOS = "!!!"
EOS = " ``"
def initialize(encoding: "cl100k_base")
pyimport :tiktoken
@encoder = tiktoken.get_encoding(encoding)
end
def bos_token = @encoder.encode(BOS).first
def eos_token = @encoder.encode(EOS).first
def tokenize(*samples)
text = samples.map { |s| "#{BOS} #{s.downcase.strip}#{EOS}" }.join
Array(@encoder.encode(text))
end
def detokenize(tokens)
tokens.delete(bos_token)
tokens.delete(eos_token)
@encoder.decode(tokens)
end
end"the cat sat on the mat""where did the dog sit?"class LanguageModel
DEFAULT_SEQUENCE_LENGTH = 10
N = 3
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(prompt: ARGV[0], sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = @tokenizer.tokenize(prompt)[0..-2]
until sequence.length >= sequence_length
break if sequence.last == @tokenizer.eos_token
next_token = generate_next_token(context: sequence.last(N - 1))
sequence.push next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return @tokenizer.eos_token if Array(candidates).empty?
candidates.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(*Document.new.samples)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end$ "the cat"
# => "the cat sat on the mat"
$ "sat on"
# => "sat on the mat"
$ "i want to"
# => "i want to"class LanguageModel
DEFAULT_SEQUENCE_LENGTH = 10
N = 3
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(prompt: ARGV[0], sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = @tokenizer.tokenize(prompt)[0..-2]
until sequence.length >= sequence_length
break if sequence.last == @tokenizer.eos_token
next_token = generate_next_token(context: sequence.last(N - 1))
sequence.push next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return @tokenizer.eos_token if Array(candidates).empty?
candidates.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(*Document.new.samples)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end

class LanguageModel
DEFAULT_SEQUENCE_LENGTH = 10
N = 3
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(prompt: ARGV[0], sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = @tokenizer.tokenize(prompt)[0..-2]
until sequence.length >= sequence_length
break if sequence.last == @tokenizer.eos_token
next_token = generate_next_token(context: sequence.last(N - 1))
sequence.push next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return @tokenizer.eos_token if Array(candidates).empty?
candidates.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(*Document.new.samples)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end

class Document
IGNORED_PUNCTUATION_REGEXP = /(\[|\]"|“|”|’|\r?\n)/
WORD_REGEX = /
(?:
[[:alnum:]]+
(?:['-][[:alnum:]]+)*
)
|
(?:[.!])
|
(?:[,;])
|
(?:\s+)
/x
attr_reader :samples
def initialize(name = "simple_text")
@samples = File.readlines("documents/#{name}.txt").lazy.map do |line|
line.gsub!(IGNORED_PUNCTUATION_REGEXP, "")
line.strip!
line.scan(WORD_REGEX).join
end.reject(&:empty?)
end
end

class LanguageModel
DEFAULT_SEQUENCE_LENGTH = 10
N = 3
def initialize
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(prompt: ARGV[0], sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = @tokenizer.tokenize(prompt)[0..-2]
until sequence.length >= sequence_length
break if sequence.last == @tokenizer.eos_token
next_token = generate_next_token(context: sequence.last(N - 1))
sequence.push next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return @tokenizer.eos_token if Array(candidates).empty?
candidates.max_by(&:probability).token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(*Document.new('frankenstein_text').samples)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end$ "in my pain, I felt"
# => "in my pain, i felt the greatest attention"
$ "i want to walk"
# => "i want to walk in fear and agitation."Applying a degree of randomness when choosing the next token, instead of always picking the most likely one.
class LanguageModel
DEFAULT_SEQUENCE_LENGTH = (ARGV[1] || 10).to_i
N = 3
def initialize
@document = Document.new("frankenstein_text")
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
end
def generate(prompt: ARGV[0], sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = @tokenizer.tokenize(prompt)[0..-2]
until sequence.last == @tokenizer.eos_token
break if sequence.length >= sequence_length
next_token = generate_next_token(context: sequence.last(N - 1))
sequence.push next_token
end
@tokenizer.detokenize(sequence)
end
protected
def generate_next_token(context:)
candidates = @probability_distribution[context]
return @tokenizer.eos_token if Array(candidates).empty?
total = candidates.sum(&:probability)
pick = rand * total
cumulative = 0.0
candidates.each do |tp|
cumulative += tp.probability
return tp.token if cumulative >= pick
end
candidates.last.token
end
def calculate_probability_distribution
tokens = @tokenizer.tokenize(*@document.samples)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end

class LanguageModel
DEFAULT_SEQUENCE_LENGTH = (ARGV[1] || 10).to_i
N = 3
def initialize
@document = Document.new("frankenstein_text")
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
@next_token_generator = NextTokenGenerator.new(
probability_distribution: @probability_distribution,
eos_token: @tokenizer.eos_token,
n: N
)
end
def generate(prompt: ARGV[0], sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = @tokenizer.tokenize(prompt)[0..-2]
until sequence.last == @tokenizer.eos_token
break if sequence.length >= sequence_length
next_token = @next_token_generator.generate(context: sequence.last(N - 1))
sequence.push next_token
end
@tokenizer.detokenize(sequence)
end
protected
def calculate_probability_distribution
tokens = @tokenizer.tokenize(*@document.samples)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end
class LanguageModel
DEFAULT_SEQUENCE_LENGTH = (ARGV[1] || 10).to_i
N = 3
def initialize
@document = Document.new("frankenstein_text")
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
@next_token_generator = NextTokenGenerator.new(
probability_distribution: @probability_distribution,
eos_token: @tokenizer.eos_token,
n: N
)
end
def generate(prompt: ARGV[0], sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = @tokenizer.tokenize(prompt)[0..-2]
until sequence.last == @tokenizer.eos_token
break if sequence.length >= sequence_length
next_token = @next_token_generator.generate(context: sequence.last(N - 1))
sequence.push next_token
end
@tokenizer.detokenize(sequence)
end
protected
def calculate_probability_distribution
tokens = @tokenizer.tokenize(*@document.samples)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end
class NextTokenGenerator
def initialize(probability_distribution:, eos_token:, n:)
@probability_distribution = probability_distribution
@eos_token = eos_token
@n = n
end
def generate(context:)
candidates = @probability_distribution[context]
return @eos_token if Array(candidates).empty?
pick = rand
cumulative = 0.0
candidates.each do |c|
cumulative += c.probability
return c.token if pick <= cumulative
end
candidates.last.token
end
end

Randomly selecting, based on the assigned probability of each item
Adjusting the probability weights to be more or less pronounced
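A small sketch of what that adjustment does to a two-token distribution, using the same exp(log(p) / temperature) rescaling the NextTokenGenerator below applies (the exact percentages on the slide may come from a slightly different starting distribution):

# Rescale a distribution: temperatures below 1.0 sharpen it towards the most
# likely token, temperatures above 1.0 flatten it towards uniform.
def apply_temperature(distribution, temperature)
  scaled = distribution.transform_values { |p| Math.exp(Math.log(p) / temperature) }
  total = scaled.values.sum
  scaled.transform_values { |p| p / total }
end

distribution = { "A" => 0.75, "B" => 0.25 }
apply_temperature(distribution, 0.5) # => roughly { "A" => 0.9, "B" => 0.1 }   (sharper)
apply_temperature(distribution, 2.0) # => roughly { "A" => 0.63, "B" => 0.37 } (flatter)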
class LanguageModel
DEFAULT_SEQUENCE_LENGTH = (ARGV[1] || 10).to_i
TEMPERATURE = (ARGV[2] || 1.0).to_f
N = 3
def initialize(temperature: TEMPERATURE)
@document = Document.new("frankenstein_text")
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
@next_token_generator = NextTokenGenerator.new(
probability_distribution: @probability_distribution,
eos_token: @tokenizer.eos_token,
temperature: temperature,
n: N
)
end
def generate(prompt: ARGV[0], sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = @tokenizer.tokenize(prompt)[0..-2]
until sequence.last == @tokenizer.eos_token
break if sequence.length >= sequence_length
next_token = @next_token_generator.generate(context: sequence.last(N - 1))
sequence.push next_token
end
@tokenizer.detokenize(sequence)
end
protected
def calculate_probability_distribution
tokens = @tokenizer.tokenize(*@document.samples)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end
class LanguageModel
DEFAULT_SEQUENCE_LENGTH = (ARGV[1] || 10).to_i
TEMPERATURE = (ARGV[2] || 1.0).to_f
N = 3
def initialize(temperature: TEMPERATURE)
@document = Document.new("frankenstein_text")
@tokenizer = Tokenizer.new
@probability_distribution = calculate_probability_distribution
@next_token_generator = NextTokenGenerator.new(
probability_distribution: @probability_distribution,
eos_token: @tokenizer.eos_token,
temperature: temperature,
n: N
)
end
def generate(prompt: ARGV[0], sequence_length: DEFAULT_SEQUENCE_LENGTH)
sequence = @tokenizer.tokenize(prompt)[0..-2]
until sequence.last == @tokenizer.eos_token
break if sequence.length >= sequence_length
next_token = @next_token_generator.generate(context: sequence.last(N - 1))
sequence.push next_token
end
@tokenizer.detokenize(sequence)
end
protected
def calculate_probability_distribution
tokens = @tokenizer.tokenize(*@document.samples)
counts = NGramCounter.new(tokens: tokens, n: N).ngram_counts
ProbabilityDistribution.new(ngram_counts: counts).distribution
end
end
class NextTokenGenerator
def initialize(probability_distribution:, eos_token:, temperature:, n:)
raise ArgumentError, "temperature must be > 0" unless temperature.positive?
@probability_distribution = probability_distribution
@eos_token = eos_token
@temperature = temperature
@n = n
end
def generate(context:)
candidates = @probability_distribution[context]
return @eos_token if Array(candidates).empty?
scaled = candidates.to_h do |c|
scaled_value = Math.exp(Math.log(c.probability) / @temperature)
[c.token, scaled_value]
end
total = scaled.values.sum
normalized = scaled.transform_values { |v| v / total }
pick = rand
cumulative = 0.0
normalized.each do |token, prob|
cumulative += prob
return token if pick <= cumulative
end
candidates.last.token
end
end

Temperature example for a two-token distribution: at temperature 1.0, Token A 75% and Token B 25%; at temperature 0.5, Token A 99% and Token B 1%; at temperature 2, Token A 63.3% and Token B 33.6%.
$ "and as I walked" (high temp)
# => "and as i walked on, while, with"
$ "and as I walked" (low temp)
# => "and as i walked up and down the passages"
"the cat sat on"
The cat sat on the coffin of henry
"the cat sat on"
The cat sat on the point of rest; we are enemies
"I pray that one day"
i pray that one day, but i was, indeed, i was unable to solve these questions continually recurred to the house of mourning, and the well, and i was a little uneasy that they would be disgusted, until i had been the favourite dream of pleasure
"the cat sat on the mat"279
8415
7731
389
5634
the
cat
sat
on
mat
| d1 | d2 | d3 | d4 | ... |
|---|---|---|---|---|
| 0.04 | -0.25 | 0.24 | 0.831 | ... |
| 0.98 | 0.7 | -0.85 | 0.4 | ... |
| 0.24 | -0.124 | -0.3 | 0.341 | ... |
| -0.45 | 0.2 | 0.12 | 0.91 | ... |
| -0.56 | 0.45 | 0.24 | 0.831 | ... |
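To make the lookup concrete, a toy sketch keyed by token id, reusing the illustrative values from the table above (not real embedding weights):

# Toy embedding table: each token id maps to a vector of floats.
EMBEDDING_TABLE = {
  279  => [0.04, -0.25, 0.24, 0.831],   # the
  8415 => [0.98, 0.7, -0.85, 0.4],      # cat
  7731 => [0.24, -0.124, -0.3, 0.341],  # sat
  389  => [-0.45, 0.2, 0.12, 0.91],     # on
  5634 => [-0.56, 0.45, 0.24, 0.831],   # mat
}

token_ids = [279, 8415, 7731, 389, 279, 5634] # "the cat sat on the mat"
vectors = token_ids.map { |id| EMBEDDING_TABLE.fetch(id) }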
Diagram: the token vector and hidden state, combined with the embedding matrix and output weight matrix.

Hidden state: | 0.78 | -0.23 | 0.45 | 0.33 | -0.34 |
EMBEDDINGS = {
"A" => [0.1, 0.5, 0.9],
"B" => [0.3, 0.7, 0.2],
"C" => [0.9, 0.1, 0.4],
}
W_OUTPUT = [
[0.2, 0.5, 0.3],
[0.1, 0.4, 0.2],
[0.7, 0.6, 0.5],
]
hidden_state = [0.0, 0.0, 0.0]
sequence = ["A"]
100.times do
current_token = sequence.last
next_token = get_next_token(current_token, hidden_state)
sequence << next_token
break if next_token == EOS
end
puts sequence.join(' ')
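The loop above leaves get_next_token and EOS undefined. A minimal sketch of one way they could look (it would need to be defined before the loop runs); the blend of embedding and hidden state and the argmax over W_OUTPUT rows are assumptions for illustration, not the talk's actual model:

EOS = "EOS" # sentinel token; this toy vocabulary never produces it, so the loop runs its full 100 steps

def get_next_token(current_token, hidden_state)
  embedding = EMBEDDINGS.fetch(current_token)

  # Fold the current token's embedding into the hidden state (a crude average).
  hidden_state.each_index { |i| hidden_state[i] = (hidden_state[i] + embedding[i]) / 2.0 }

  # Score every vocabulary token: one row of W_OUTPUT per token, dotted with the hidden state.
  scores = W_OUTPUT.map { |row| row.zip(hidden_state).sum { |w, h| w * h } }

  # Greedily pick the token whose row produced the highest score.
  EMBEDDINGS.keys[scores.index(scores.max)]
end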
Hidden state: | 0.78 | -0.23 | 0.45 | 0.33 | -0.34 |

"he watched as the cat sat on the mat"

What does attention mean?

Attention means processing the entire context at once to determine which tokens are most relevant to the current position.
The model then uses this relevance map to weight information from the context before generating probabilities for the next word.
Diagram: each token of "the cat sat on the mat" gets a query (Q), key (K), and value (V) vector. For the current token "the", its query is scored against every key, and the resulting weights (e.g. x0.6, x0.2, x0.7, x0.1, x0.4, x0.5) scale the value vectors, which are summed into a context vector. Several attention heads (1, 2, 3) each produce their own context vector, feeding the final token probabilities:

| token | probability |
|---|---|
| the | 0.01 |
| cat | 0.93 |
| sat | 0.01 |
| on | 0.01 |
| mat | 0.04 |
require "matrix"

# For each token, compare its query vector against every other token's key
# vector, and use that similarity to weight how much of the other token's
# value vector is mixed into this token's context vector.
# get_query / get_key / get_value look up each token's learned vectors;
# similarity is typically a dot product.
context_vectors = tokens.map do |token_a|
  value = Vector.zero(5) # 5 dimensions
  query = get_query(token_a)
  tokens.each do |token_b|
    next if token_a == token_b
    key = get_key(token_b)
    value += get_value(token_b) * similarity(query, key)
  end
  value
end
Information is not knowledge.
Knowledge is not wisdom.
Wisdom is not truth.