⚡️⚡️⚡️
Migrations at scale

⚡️⚡️⚡️

Ruby: each

class AddPublishedToPlayerMove < ActiveRecord::Migration
  def up
    add_column :player_moves, :published, :boolean
    PlayerMove.reviews.each(&:update_published).each &:save
  end

  def down
    remove_column :player_moves, :published
  end
end

Pro: easy to write and understand

Con: *very* slow, and loads every record into memory at once

Ruby: find_each

class AddPublishedToPlayerMove < ActiveRecord::Migration
  def up
    add_column :player_moves, :published, :boolean
    PlayerMove.reviews.find_each do |p|
      p.update_published
      p.save
    end
  end

  def down
    remove_column :player_moves, :published
  end
end

Pro: won't consume all your memory (records are loaded in batches of 1,000 by default)

Ruby: find_in_batches

class AddPublishedToPlayerMove < ActiveRecord::Migration
  def up
    add_column :player_moves, :published, :boolean
    PlayerMove.reviews.find_in_batches do |batch|
      batch.each do |p|
        p.update_published
        p.save
      end
      GC.start # ✨
    end
  end

  def down
    remove_column :player_moves, :published
  end
end

Pro: allows you to manually run garbage collection

How slow are we talking?

If you have over 100k records in the table, you need to worry about performance.

   2 ms for database read

+ 3 ms for Ruby

+ 2 ms for database write

= 7 ms per record

* 100k records

= 12 minutes

Why does that matter?

Database locking!
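One common safety net, assuming Postgres: the whole migration runs in a single transaction, so a 12-minute backfill holds its row locks, plus the ACCESS EXCLUSIVE lock taken by add_column, until commit. Postgres can be told to give up instead of queueing; a minimal sketch for the top of up, with arbitrary timeout values:

# fail fast instead of waiting behind other sessions' locks (Postgres settings)
execute "set lock_timeout = '5s'"
# cap any single statement, e.g. one huge UPDATE, rather than hold locks indefinitely
execute "set statement_timeout = '30min'"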

Ruby: update_all

class AddPublishedToPlayerMove < ActiveRecord::Migration
  def up
    add_column :player_moves, :published, :boolean
    PlayerMove.reviews.update_all published: true
  end

  def down
    remove_column :player_moves, :published
  end
end

Pros: runs a single SQL UPDATE, can accept custom conditions

Con: doesn't run callbacks or validations
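For reference, the conditions of a normal relation carry through to that single UPDATE, and because callbacks are skipped, anything they maintain (like updated_at) has to be set by hand. A small sketch reusing the slide's scope, with a hypothetical condition:

# extra conditions narrow the single UPDATE; updated_at is set manually since callbacks don't run
PlayerMove.reviews.where(published: nil)
          .update_all("published = true, updated_at = now()")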

Ruby: ActiveRecord Import

require 'csv'

class CreateReviews < ActiveRecord::Migration
  def up
    create_table :reviews # ...

    file = File.expand_path('reviews.csv', __dir__)
    reviews = CSV.read(file, headers: true).map do |row|
      [row['content'], row['rating'], row['user_id']]
    end

    Review.import [:content, :rating, :user_id], reviews
  end

  def down
    drop_table :reviews
  end
end

Pro: easy to quickly import a ton of data without writing SQL yourself

Con: still instantiates and validates each record in Ruby (the INSERTs themselves are batched)
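If the CSV is already trusted, or the import is huge, the gem's validate and batch_size options help; a sketch with arbitrary values:

# skip per-record validation and split the bulk INSERT into chunks
Review.import [:content, :rating, :user_id], reviews,
              validate: false, batch_size: 10_000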

SQL

def up
  create_table :incentives do |t|
    t.integer    :amount, default: 0, null: false
    t.string     :status
    t.timestamp  :sent_at
    t.references :resident, null: false, index: true
    t.references :property, null: false, index: true
    t.timestamps
  end

  insert <<-SQL.squish
    insert into incentives
      (resident_id, property_id,           amount,           status,           sent_at)
      select    id, property_id, incentive_amount, incentive_status, incentive_sent_at
      from residents where incentive_amount > 0
  SQL

  remove_columns :residents, :incentive_amount, :incentive_status, :incentive_sent_at
end

Pros: blazingly fast; timing and the number of records changed are logged to the console

Cons: hard to understand, easy to mess up
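One guard against the "easy to mess up" part is to assert the copy before dropping the source columns; a sketch that could sit just before the remove_columns call, using the connection's select_value:

# sanity check: every resident with an incentive should now have an incentives row
source = select_value("select count(*) from residents where incentive_amount > 0").to_i
copied = select_value("select count(*) from incentives").to_i
raise "expected #{source} incentives, copied #{copied}" unless copied == source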

SQL

# Removing duplicate records from a table with millions of rows
def up
  delete <<-SQL.squish
    delete from residents
    where id in (
      select id
      from (
        select id,
          row_number() over (partition by email, property_id order by id)
        from residents
      ) t
      where t.row_number > 1
    )
  SQL

  delete <<-SQL.squish
    delete from residents where email = 'undefined'
  SQL

  add_index :residents, [:property_id, :email], unique: true
end

Pro: can handle hard problems in seconds that would take a Ruby script hours / days to complete
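Before running a destructive delete like this, a dry run of the same grouping in psql shows how many duplicates exist and where (a sketch; nothing is deleted):

-- which (email, property_id) pairs are duplicated, and how badly?
select email, property_id, count(*)
from residents
group by email, property_id
having count(*) > 1
order by count(*) desc
limit 20;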

What if it's still too slow?

saver = -> action {
  # ...
}

Action.order(:id).limit(100_000).each &saver
Action.order(:id).limit(100_000).offset(100_000).each &saver
Action.order(:id).limit(100_000).offset(200_000).each &saver
Action.order(:id).limit(100_000).offset(300_000).each &saver
Action.order(:id).limit(100_000).offset(400_000).each &saver
Action.order(:id).limit(100_000).offset(500_000).each &saver
Action.order(:id).limit(100_000).offset(600_000).each &saver

When modifying tens of millions of rows

  • create a temporary table
  • write to that from a background script
    • make sure it's fault-tolerant!
  • then replace the old table with the new table (see the sketch below)
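A minimal sketch of the create-and-swap steps, assuming Postgres and reusing the deck's player_moves table as an example (the backfill itself lives in the background script):

# copy the table's structure, defaults and indexes
execute "create table player_moves_new (like player_moves including all)"
# ... a separate, fault-tolerant script copies rows into player_moves_new,
#     e.g. in id ranges it can resume from after a failure ...
# once the copy has caught up, swap the tables:
execute "alter table player_moves rename to player_moves_old"
execute "alter table player_moves_new rename to player_moves"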

SQL tips

Regular expressions!
JSON manipulation!
Common table expressions!
Window functions!

select count(*) from users where email ~* '^f|t$';

update player_moves set content = to_json(feedbacks)
from feedbacks where player_move_id = player_moves.id;

with counts as (
  select player_id,
    count(*) filter (where site = 'facebook') as facebook,
    count(*) filter (where site = 'google')   as google
  from friends
  group by player_id
)
update players set facebook_friends = facebook, google_friends = google
from counts where player_id = players.id;

SQL tips: clearing PaperTrail

# select count(*) from versions where item_type = 'Authentication'
#   and object_changes->>'s3_image' is not null;
#  count  
# --------
#  317816
#
# select count(*) from versions where item_type = 'Authentication'
#   and object_changes->>'s3_image' is not null
#   and object_changes - 's3_image' - 'updated_at' = '{}'::jsonb;
#  count
# --------
#  114834
delete <<-SQL.squish
  delete from versions where item_type = 'Authentication'
    and object_changes->>'s3_image' is not null
    and object_changes - 's3_image' - 'updated_at' = '{}'::jsonb
SQL
update <<-SQL.squish
  update versions
    set object_changes = object_changes - 's3_image'
  where item_type = 'Authentication'
    and object_changes->>'s3_image' is not null
SQL
update <<-SQL.squish
  update versions
    set object = object - 's3_image'
  where item_type = 'Authentication'
    and object->>'s3_image' is not null
SQL
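A quick way to confirm the cleanup is to re-run the first commented count, which should now come back as zero; a sketch:

# expect 0 once the delete and updates above have run
remaining = select_value(<<-SQL.squish).to_i
  select count(*) from versions
  where item_type = 'Authentication'
    and object_changes->>'s3_image' is not null
SQL
say "versions still carrying s3_image in object_changes: #{remaining}"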

Questions?

Migrations at scale

By seanlinsley
