How to build a compiler?
Things To Remember:
- Compilers seem unapproachable, but they are not.
- We must attempt to learn about compilers.
Take-aways:
- Understand the basic architecture of (almost) all compilers
- Be able to understand the code of any JS compiler and of tools like ESLint, UglifyJS, PostCSS, etc.
- Be able to write your own compiler
Yatharth Khatri
Design Systems and Frontend Architect
Classical Pianist
GitHub: yatharthk
Twitter: yatharthkhatri
Why should I learn about compilers?
Because we use compilers
ALL THE TIME
We use languages that compile to JavaScript
And tools to improve our workflow and developer experience
Is understanding and writing compilers within my capacity?
What is a compiler?
// add two numbers
function add(a, b) {
return a + b;
}
0 1 1 1 1 1 1 0
0 0 0 1 0 0 1 1
1 0 1 1 1 1 0 0
1 1 1 1 0 0 1 1
0 0 1 1 0 0 0 1
Source Code
Machine Code
Trans-compiler / Transpiler
// add two numbers
const add = (a, b)
=> (a + b);
// add two numbers
function add(a, b) {
return a + b;
}
ES6
(High Level Language)
ES5
(High Level Language)
A general compiler
Parsing
Transformation
Code Generation
Parsing
II. Syntactic Analysis
I. Lexical Analysis
Parsing
Lexical Analysis
Source Code
Tokens
Parsing
Syntactic Analysis
Tokens
Abstract Syntax Tree (AST)
Transformation
Transformation
AST (Source Code Language)
AST (Target Language)
Code Generation
Code Generation
AST (Target Language)
Print code in target language
Our Compiler
import module from "module"
const module = require("module")
Compiler sections
/* import module from 'module'; => var module = require('module'); */
// Stage 1 (lexical analysis): takes the source text and returns the list of tokens.
function tokenizer(source) {
// ...
return tokens;
}
// Stage 2 (syntactic analysis): takes the tokens and returns an abstract syntax tree.
function parser(tokens) {
// ...
return ast;
}
// Stage 3 (transformation): takes the source-language AST and returns the target-language AST.
function transformer(ast) {
// ...
return newAst;
}
// Stage 4 (code generation): takes the target-language AST and returns the generated code.
function codeGenerator(ast) {
// ...
return code;
}
Tokens
[
{ type: "keyword", value: "import" },
{ type: "identifier", value: "module" },
{ type: "keyword", value: "from" },
{ type: "string", value: "module" }
]
import module from "module"
tokenizer
// Constants
// Words the tokenizer reports as `keyword` tokens instead of `identifier` tokens.
const KEYWORDS = ['import', 'from'];
// Matches any whitespace character.
const WHITESPACE = /\s/;
// Matches a line feed. `\s` above already matches it, so this check is
// redundant wherever WHITESPACE is tested as well.
const NEWLINE = /\n/;
// Matches an ASCII letter; digits, `_` and `$` do not match.
const LETTERS = /[a-zA-Z]/;
function tokenizer(input) {
let current = 0;
const tokens = [];
while (current < input.length) {
let char = input[current];
if (WHITESPACE.test(char) || NEWLINE.test(char) || char === ';') {
current++;
continue;
}
if (char === '"') {
let value = '';
char = input[++current];
while (char !== '"') {
value += char;
char = input[++current];
}
tokens.push({ type: 'string', value: value });
current++;
continue;
}
(...)
tokenizer
function tokenizer(input) {
let current = 0;
const tokens = [];
while (current < input.length) {
let char = input[current];
(...)
if (LETTERS.test(char)) {
let value = '';
while (LETTERS.test(char)) {
value += char;
char = input[++current];
}
if (KEYWORDS.indexOf(value) > -1) {
tokens.push({ type: 'keyword', value: value });
} else {
tokens.push({ type: 'identifier', value: value });
}
current++;
continue;
}
throw new Error(`Unrecognized token: ${char}`);
}
return tokens;
}
Tokens
[
{ type: "keyword", value: "import" },
{ type: "identifier", value: "module" },
{ type: "keyword", value: "from" },
{ type: "string", value: "module" }
]
{
type: "Program",
body: [
{
type: "ImportDeclaration",
specifier: {
type: "Identifier",
name: "module"
},
source: {
type: "StringLiteral",
value: "module"
}
}
]
}
Abstract Syntax Tree
parser
// Builds the AST from the token list.
// `current` is the index of the next token to consume; the nested `walk`
// function reads and advances it through the closure.
function parser(tokens) {
let current = 0;
// Consumes the token at `current` and returns the AST node built from it.
function walk() {
let token = tokens[current];
// A string token becomes a StringLiteral node carrying the token's value.
if (token.type === 'string') {
current++;
return {
type: 'StringLiteral',
value: token.value
};
}
// An identifier token becomes an Identifier node carrying the token's value as its name.
if (token.type === 'identifier') {
current++;
return {
type: 'Identifier',
name: token.value
};
}
(...)
}
(...)
}
parser
(...)
if (token.type === 'keyword') {
if (token.value === 'import') {
current++;
const node = {
type: 'ImportDeclaration',
source: null,
specifier: null
};
if (tokens[current].type === 'identifier') {
node.specifier = walk();
} else {
throw new Error(
`Parse error: Unexpected ${
tokens[current].value
} after \`import\``
);
}
// expect keyword `from` after the identifier.
if (
tokens[current].type === 'keyword' &&
tokens[current].value === 'from'
) {
current++;
} else {
throw new Error(
`Parse Error: Unexpected token after ${node.specifier.name}`
);
}
(...)
parser
(...)
function walk() {
(...)
if (tokens[current].type === 'string') {
node.source = walk();
} else {
throw new Error(`Parse Error: Unexpected token after \`from\``);
}
return node;
}
}
throw new Error(`Unrecognized token: ${token.value}`);
}
}
const ast = {
type: 'Program',
body: []
};
while (current < tokens.length) {
ast.body.push(walk());
}
return ast;
}
Source AST
{
type: "Program",
body: [
{
type: "ImportDeclaration",
specifier: {
type: "Identifier",
name: "module"
},
source: {
type: "StringLiteral",
value: "module"
}
}
]
}
{
type: "Program",
body: [
{
type: "VariableDeclaration",
kind: "var",
id: {
type: "Identifier",
name: "module"
},
init: {
type: "CallExpression",
callee: {
type: "Identifier",
name: "require",
},
arguments: [{ type: "StringLiteral", value: "module" }]
}
}
]
}
New AST
traverser
/**
 * Walks the AST depth-first and calls the matching visitor method for every
 * node after that node's children have been walked.
 *
 * @param {object} ast - Root node of the tree to walk.
 * @param {object} visitor - Methods keyed by node type; each receives the node.
 * @throws {TypeError} If a node has a type this walker does not know.
 */
function traverser(ast, visitor) {
  const visit = (node) => {
    // Look the handler up before descending, then call it once the children are done.
    const handler = visitor[node.type];

    if (node.type === 'Program') {
      node.body.forEach((child) => visit(child));
    } else if (node.type === 'ImportDeclaration') {
      visit(node.specifier);
      visit(node.source);
    } else if (node.type !== 'StringLiteral' && node.type !== 'Identifier') {
      // StringLiteral and Identifier are leaves; anything else is unknown.
      throw new TypeError(node.type);
    }

    if (handler) {
      handler(node);
    }
  };

  visit(ast);
}
// VISITOR
{
ImportDeclaration(node) {
// transform node
},
Identifier(node) {
// ...
},
StringLiteral(node) {
// ...
},
}
transformer
/**
 * Rewrites every ImportDeclaration node in place into the equivalent
 * `var <specifier> = require(<source>)` VariableDeclaration node.
 *
 * @param {object} ast - Source AST; it is mutated.
 * @returns {object} The same `ast` object after the rewrite.
 */
function transformer(ast) {
  const visitor = {
    ImportDeclaration(node) {
      const requireCall = {
        type: 'CallExpression',
        callee: {
          type: 'Identifier',
          name: 'require'
        },
        arguments: [node.source]
      };
      const replacement = {
        type: 'VariableDeclaration',
        kind: 'var',
        id: node.specifier,
        init: requireCall
      };

      // The node is rewritten in place rather than replaced in its parent:
      // drop the import-only fields, then copy the declaration fields over.
      delete node.specifier;
      delete node.source;
      Object.assign(node, replacement);
    }
  };

  traverser(ast, visitor);
  return ast;
}
codeGenerator
/**
 * Recursively prints a target-language AST node as source text.
 *
 * @param {object} node - Node to print; `node.type` selects the rule.
 * @returns {string} The generated code for `node`.
 * @throws {TypeError} If `node.type` is not one of the handled types, for
 *   example an ImportDeclaration that was never transformed. Without this
 *   the function would return `undefined` and the text "undefined" would
 *   end up in the output.
 */
function codeGenerator(node) {
  switch (node.type) {
    case 'Program':
      // One statement per line.
      return node.body.map(codeGenerator).join('\n');
    case 'Identifier':
      return node.name;
    case 'StringLiteral':
      return `"${node.value}"`;
    case 'VariableDeclaration': {
      const id = codeGenerator(node.id); // module, etc.
      const init = codeGenerator(node.init); // CallExpression
      return `${node.kind} ${id} = ${init}`;
    }
    case 'CallExpression': {
      const callee = codeGenerator(node.callee); // require
      const args = node.arguments.map(codeGenerator).join(', '); // "module", etc.
      // The statement terminator is emitted here, with the call itself.
      return `${callee}(${args});`;
    }
    default:
      throw new TypeError(node.type);
  }
}
compiler
/**
 * Runs the whole pipeline: tokenizer -> parser -> transformer -> codeGenerator.
 *
 * @param {string} input - Source text to compile.
 * @returns {string} Whatever `codeGenerator` produces for the transformed AST.
 */
function compiler(input) {
  return codeGenerator(transformer(parser(tokenizer(input))));
}
// Export each stage individually alongside the end-to-end `compiler`.
module.exports = {
tokenizer,
parser,
transformer,
codeGenerator,
compiler
};
We just built a JavaScript compiler
yatharthk/micro-es6-import-compiler
GITHUB
I hope you learned a few good things today
How to build a compiler
By Yatharth K
How to build a compiler
Slides for a talk on "How to build a compiler?" and why we must attempt to learn about compilers.
- 258