How to build a compiler?

Things To Remember:

  • Compilers seem unapproachable, but they are not.
  • We must attempt to learn about compilers.

Take-aways:

  • Understand the basic architecture of (almost) all the compilers
  • Be able to understand the code of any JS compiler and of tools like ESLint, UglifyJS, PostCSS, etc.
  • Be able to write your own compiler

Yatharth Khatri

Design Systems and Frontend Architect

Classical Pianist

GitHub:    yatharthk

Twitter:    yatharthkhatri

Why should I learn about compilers?

Because we use compilers

ALL THE TIME

We use languages that compile to JavaScript

And tools to improve our workflow and developer experience

Is understanding and writing compilers within my capacity?

What is a compiler?

// add two numbers
function add(a, b) {
  // Apply `+` to the two arguments and hand the result back.
  const sum = a + b;
  return sum;
}
0 1 1 1 1 1 1 0

0 0 0 1 0 0 1 1

1 0 1 1 1 1 0 0 

1 1 1 1 0 0 1 1

0 0 1 1 0 0 0 1

Source Code

Machine Code

Trans-compiler / Transpiler

// add two numbers
// ES6 form: arrow function with an implicit return of `a + b`.
const add = (a, b) => a + b;
// add two numbers
function add(a, b) {
  // ES5 output: a plain function declaration and `var`, no ES6 syntax.
  var sum = a + b;
  return sum;
}

ES6

(High Level Language)

ES5

(High Level Language)

A general compiler

Parsing
Transformation
Code Generation

Parsing

II.  Syntactic Analysis

I.     Lexical Analysis

Parsing

Lexical Analysis

Source Code

Tokens

Parsing

Syntactic Analysis

Tokens

Abstract Syntax Tree (AST)

Transformation

Transformation

AST (Source Code Language)

AST (Target Language)

Code Generation

Code Generation

AST (Target Language)

Print code in target language

Our Compiler

import module from "module"
const module = require("module")

Compiler sections

/* import module from 'module';  =>  var module = require('module'); */

/**
 * Stage 1, lexical analysis: takes the source code and returns its tokens.
 * Skeleton only; the body is elided on this slide.
 */
function tokenizer(source) {
    // ...
    return tokens;
}

/**
 * Stage 2, syntactic analysis: takes the tokens and returns an abstract
 * syntax tree (AST). Skeleton only; the body is elided on this slide.
 */
function parser(tokens) {
    // ...
    return ast;
}

/**
 * Stage 3, transformation: takes the AST of the source language and returns
 * the AST of the target language. Skeleton only; the body is elided on this
 * slide.
 */
function transformer(ast) {
    // ...
    return newAst;
}

/**
 * Stage 4, code generation: takes the target-language AST and returns the
 * code printed in the target language. Skeleton only; the body is elided on
 * this slide.
 */
function codeGenerator(ast) {
    // ...
    return code;
}

Tokens

[
    { type: "keyword", value: "import" },
    
    { type: "identifier", value: "module" },

    { type: "keyword", value: "from" },

    { type: "string", value: "module" }
]
import module from "module"

tokenizer

// Constants
// Words the tokenizer emits as `keyword` tokens rather than `identifier` tokens.
const KEYWORDS = ['import', 'from'];
// A single whitespace character (the `\s` class already covers `\n`).
const WHITESPACE = /\s/;
// A line feed; tested next to WHITESPACE when the tokenizer skips characters.
const NEWLINE = /\n/;
// A single ASCII letter; digits, `_` and `$` do not match.
const LETTERS = /[a-zA-Z]/;

function tokenizer(input) {
  let current = 0;
  const tokens = [];

  while (current < input.length) {
    let char = input[current];

    if (WHITESPACE.test(char) || NEWLINE.test(char) || char === ';') {
      current++;
      continue;
    }

    if (char === '"') {
      let value = '';

      char = input[++current];

      while (char !== '"') {
        value += char;
        char = input[++current];
      }

      tokens.push({ type: 'string', value: value });

      current++;
      continue;
    }
  (...)

tokenizer

function tokenizer(input) {
  let current = 0;
  const tokens = [];

  while (current < input.length) {
    let char = input[current];

    (...)
    
    if (LETTERS.test(char)) {
      let value = '';

      while (LETTERS.test(char)) {
        value += char;
        char = input[++current];
      }

      if (KEYWORDS.indexOf(value) > -1) {
        tokens.push({ type: 'keyword', value: value });
      } else {
        tokens.push({ type: 'identifier', value: value });
      }

      current++;
      continue;
    }

    throw new Error(`Unrecognized token: ${char}`);
  }

  return tokens;
}

Tokens

[
    { type: "keyword", value: "import" },
    
    { type: "identifier", value: "module" },

    { type: "keyword", value: "from" },

    { type: "string", value: "module" }
]
{
    type: "Program",

    body: [
        {
            type: "ImportDeclaration",
            specifier: {
                type: "Identifier",
                name: "module"
            },
            source: {
                type: "StringLiteral",
                value: "module"
            }
        }
    ]
}

Abstract Syntax Tree

parser

/**
 * Syntactic analysis: reads the token list and builds AST nodes from it.
 * Slide excerpt; `(...)` marks code that is elided here.
 *
 * @param {Array<{type: string, value: string}>} tokens - Tokens to parse.
 */
function parser(tokens) {
  // Index of the next token to consume; walk() advances it through the closure.
  let current = 0;

  // Looks at the token at `current`, consumes it and returns its AST node.
  function walk() {
    let token = tokens[current];

    // `string` token -> StringLiteral node carrying the same value.
    if (token.type === 'string') {
      current++;

      return {
        type: 'StringLiteral',
        value: token.value
      };
    }

    // `identifier` token -> Identifier node; the token value becomes `name`.
    if (token.type === 'identifier') {
      current++;

      return {
        type: 'Identifier',
        name: token.value
      };
    }

    (...)
  }

  (...)
}

parser

(...)
      if (token.type === 'keyword') {
        if (token.value === 'import') {
          current++;
          
          const node = {
            type: 'ImportDeclaration',
            source: null,
            specifier: null
          };

          if (tokens[current].type === 'identifier') {
            node.specifier = walk();
          } else {
            throw new Error(
              `Parse error: Unexpected ${
                tokens[current].value
              } after \`import\``
            );
          }

          // expect keyword `from` after the identifier.
          if (
            tokens[current].type === 'keyword' &&
            tokens[current].value === 'from'
          ) {
            current++;
          } else {
            throw new Error(
              `Parse Error: Unexpected token after ${node.specifier.name}`
            );
          }
(...)

parser

(...)

    function walk() {
      (...)

          if (tokens[current].type === 'string') {
            node.source = walk();
          } else {
            throw new Error(`Parse Error: Unexpected token after \`from\``);
          }

          return node;
        }
      }
        
      throw new Error(`Unrecognized token: ${token.value}`);
    }
  }

  const ast = {
    type: 'Program',
    body: []
  };

  while (current < tokens.length) {
    ast.body.push(walk());
  }

  return ast;
}

Source AST

{
    type: "Program",

    body: [
        {
            type: "ImportDeclaration",
            specifier: {
                type: "Identifier",
                name: "module"
            },
            source: {
                type: "StringLiteral",
                value: "module"
            }
        }
    ]
}
{
    type: "Program",

    body: [
        {
            type: "VariableDeclaration",
            kind: "var",
            id: {
                type: "Identifier",
                name: "module"
            },
            init: {
                type: "CallExpression",
                callee: {
                  type: "Identifier",
                  name: "require"
                },
                arguments: [
                    {
                        type: "StringLiteral",
                        value: "module"
                    }
                ]
            }
        }
    ]
}

New AST

traverser

/**
 * Walks the AST depth-first and calls the matching visitor method on each
 * node after that node's children have been walked.
 *
 * @param {object} ast - Root node; every node carries a `type` string.
 * @param {object} visitor - Maps node type names to functions taking the node.
 * @throws {TypeError} If a node has a type the traverser does not know.
 */
function traverser(ast, visitor) {
  const visit = (node) => {
    // Look the handler up before descending, then call it on the way out.
    const handler = visitor[node.type];

    if (node.type === 'Program') {
      node.body.forEach((child) => visit(child));
    } else if (node.type === 'ImportDeclaration') {
      visit(node.specifier);
      visit(node.source);
    } else if (node.type !== 'StringLiteral' && node.type !== 'Identifier') {
      // Only leaf nodes (StringLiteral, Identifier) may reach this point.
      throw new TypeError(node.type);
    }

    if (handler) {
      handler(node);
    }
  };

  visit(ast);
}
// VISITOR

{

  ImportDeclaration(node) {
    // transform node
  }

  StringLiteral(node) {
    // ...
  },

  StringLiteral(node) {
    // ...
  },

}

transformer

/**
 * Rewrites every ImportDeclaration node into the equivalent
 * `var <specifier> = require(<source>)` VariableDeclaration node.
 * The tree is modified in place and the same root object is returned.
 *
 * @param {object} ast - Program AST produced by the parser.
 * @returns {object} The same `ast` object, after transformation.
 */
function transformer(ast) {
  const visitor = {
    ImportDeclaration(node) {
      const { specifier, source } = node;

      const replacement = {
        type: 'VariableDeclaration',
        kind: 'var',
        id: specifier,
        init: {
          type: 'CallExpression',
          callee: { type: 'Identifier', name: 'require' },
          arguments: [source],
        },
      };

      // Reshape the existing object instead of swapping it in its parent:
      // drop the import-only fields, then copy the new fields over it.
      delete node.specifier;
      delete node.source;
      Object.assign(node, replacement);
    },
  };

  traverser(ast, visitor);

  return ast;
}

codeGenerator

/**
 * Recursively prints a target-language AST node as source code.
 *
 * @param {object} node - AST node with a `type` string.
 * @returns {string} The code for `node` and everything below it.
 * @throws {TypeError} If `node.type` is not a type this generator can print.
 */
function codeGenerator(node) {
  switch (node.type) {
    case 'Program':
      // One top-level statement per line.
      return node.body.map((child) => codeGenerator(child)).join('\n');

    case 'Identifier':
      return node.name;

    case 'StringLiteral':
      return `"${node.value}"`;

    case 'VariableDeclaration':
      // e.g. `var module = <init>`
      return `${node.kind} ${codeGenerator(node.id)} = ${codeGenerator(node.init)}`;

    case 'CallExpression': {
      const callee = codeGenerator(node.callee);
      const args = node.arguments.map((arg) => codeGenerator(arg)).join(', ');

      // The statement terminator is emitted here, as in the original design,
      // so a call is always printed as a complete statement.
      return `${callee}(${args});`;
    }

    default:
      // Fail loudly instead of returning undefined, which callers would
      // otherwise concatenate into the output as the text "undefined".
      throw new TypeError(`Unknown node type: ${node.type}`);
  }
}

compiler

/**
 * Runs the whole pipeline on `input`:
 * tokenizer -> parser -> transformer -> codeGenerator.
 *
 * @param {string} input - Source code to compile.
 * @returns {string} Whatever codeGenerator returns for the transformed AST.
 */
function compiler(input) {
  return codeGenerator(transformer(parser(tokenizer(input))));
}

// Export each stage on its own as well as the end-to-end compiler().
module.exports = {
  tokenizer,
  parser,
  transformer,
  codeGenerator,
  compiler
};

We just built a JavaScript compiler

yatharthk/micro-es6-import-compiler

GITHUB

I hope you learned a few good things today

How to build a compiler

By Yatharth K

How to build a compiler

Slides for the talk "How to build a compiler?", including why we should attempt to learn about compilers.

  • 258