Introduction to the CLI

Sed & AWK

Santiago Álvarez Rodríguez
Front-end Dev at PSL
santiaro90@gmail.com

Before getting started:

 

Learning sed and AWK is almost like learning a programming language.

 

I'm providing you only with basic commands, which can help you in most situations.

 

For more complex scenarios, refer to the manual, or the links at the end of this presentation.

Sed

Stands for Stream EDitor

Allows transforming an incoming input, one line at a time.

Transform how?

  • Text substitution
  • Delete lines matching patterns
  • Append text before/after the nth line
  • etc., etc., etc...

Show me some magic!

// File name: ~/Documents/server.js
// Dependencies
var express = require('express');
var bodyParser = require('body-parser');

// Default to port 3000
var PORT = process.env.PORT || 3000;
var app = express();

if (!process.env.PORT) {
  console.warn('Defaulting to port 3000');
  console.warn("It's better if you pass the server port in an ENV variable");
}

// Express middleware
console.debug('Injecting middleware');
app.use(bodyParser.json());
app.use(bodyParser.urlencoded({extended: false});
console.log('Finished injecting middleware');

app.get('/', function (req, res) {
  console.debug('Making a request to root url');
  // Body parser makes sure to send response as JSON
  res.send({message: 'Hello, express!'});
});

app.listen(PORT, function () {
  // To be shown after cranking server up
  console.info('Server listening on port', PORT);
});

Given this text file:

# '-r' is for using extended regular expressions
# Here we tell sed to match lines starting with comment
# characters and delete them
~/Documents $ sed -r '/^\s*\/\// d' server.js
var express = require('express');
var bodyParser = require('body-parser');

var PORT = process.env.PORT || 3000;
var app = express();

if (!process.env.PORT) {
  console.warn('Defaulting to port 3000');
  console.warn("It's better if you pass the server port in an ENV variable");
}

console.debug('Injecting middleware');
app.use(bodyParser.json());
app.use(bodyParser.urlencoded({extended: false});
console.log('Finished injecting middleware');

app.get('/', function (req, res) {
  console.debug('Making a request to root url');
  res.send({message: 'Hello, express!'});
});

app.listen(PORT, function () {
  console.info('Server listening on port', PORT);
});

Delete all commented out lines

# '-e' lets you chain multiple expressions
~/Documents $ sed -r -e '/^\s*\/\// d' -e '/console\.(debug|log)/ d' server.js
var express = require('express');
var bodyParser = require('body-parser');

var PORT = process.env.PORT || 3000;
var app = express();

if (!process.env.PORT) {
  console.warn('Defaulting to port 3000');
  console.warn("It's better if you pass the server port in an ENV variable");
}

app.use(bodyParser.json());
app.use(bodyParser.urlencoded({extended: false});

app.get('/', function (req, res) {
  res.send({message: 'Hello, express!'});
});

app.listen(PORT, function () {
  console.info('Server listening on port', PORT);
});

Don't want debug messages either

# 's/pattern/substitution/' replaces 'pattern' with 'substitution' in each line
~/Documents $ sed -r -e 's/var (.*)(,|=) (.*)/let \1\2 \3/' server.js
// File name: ~/Documents/server.js
// Dependencies
let express = require('express');
let bodyParser = require('body-parser');

// Default to port 3000
let PORT = process.env.PORT || 3000;
let app = express();

if (!process.env.PORT) {
  console.warn('Defaulting to port 3000');
  console.warn("It's better if you pass the server port in an ENV variable");
}

// Express middleware
console.debug('Injecting middleware');
app.use(bodyParser.json());
app.use(bodyParser.urlencoded({extended: false});
console.log('Finished injecting middleware');

# more text...

Replace var with let

~/Documents $ sed -r -e 's/PORT/IN/' server.js
// File name: ~/Documents/server.js
// Dependencies
var express = require('express');
var bodyParser = require('body-parser');

// Default to port 3000
var IN = process.env.PORT || 3000;
var app = express();

if (!process.env.IN) {
  console.warn('Defaulting to port 3000');
  console.warn("It's better if you pass the server port in an ENV variable");
}

// Express middleware
console.debug('Injecting middleware');
app.use(bodyParser.json());
app.use(bodyParser.urlencoded({extended: false});
console.log('Finished injecting middleware');

app.get('/', function (req, res) {
  console.debug('Making a request to root url');
  // Body parser makes sure to send response as JSON
  res.send({message: 'Hello, express!'});
});

app.listen(IN, function () {
  // To be shown after cranking server up
  console.info('Server listening on port', IN);
});

Replace PORT with IN

?

's/pattern/substitution/' replaces ONLY the FIRST occurrence in each line... let's fix that!

# Notice the 'g' after the 's/pattern/substitution/' command
~/Documents $ sed -r -e 's/PORT/IN/ g' server.js
// File name: ~/Documents/server.js
// Dependencies
var express = require('express');
var bodyParser = require('body-parser');

// Default to port 3000
var IN = process.env.IN || 3000;
var app = express();

if (!process.env.IN) {
  console.warn('Defaulting to port 3000');
  console.warn("It's better if you pass the server port in an ENV variable");
}

// Express middleware
console.debug('Injecting middleware');
app.use(bodyParser.json());
app.use(bodyParser.urlencoded({extended: false});
console.log('Finished injecting middleware');

# more text...

Replace PORT with IN (every occurrence)

# This time, we're telling sed which range of lines we
# want to operate over
~/Documents $ sed '1,5 c console.log("Hello, world!")' server.js
console.log("Hello, world!")
// Default to port 3000
var PORT = process.env.PORT || 3000;
var app = express();

if (!process.env.PORT) {
  console.warn('Defaulting to port 3000');
  console.warn("It's better if you pass the server port in an ENV variable");
}

// Express middleware
console.debug('Injecting middleware');
app.use(bodyParser.json());
app.use(bodyParser.urlencoded({extended: false});
console.log('Finished injecting middleware');

# more text...

Change first 5 lines

# Mind the '\ ', so that 'date' doesn't fail
~/Documents $ sed "1 i // Modified on $(date +%Y-%m-%d\ %H:%M)" server.js
// Modified on 2016-09-25 21:14
// File name: ~/Documents/server.js
// Dependencies
var express = require('express');
var bodyParser = require('body-parser');

// Default to port 3000
var PORT = process.env.PORT || 3000;
var app = express();

if (!process.env.PORT) {
  console.warn('Defaulting to port 3000');
  console.warn("It's better if you pass the server port in an ENV variable");
}

# more text...

Insert text before the first line

~/Documents $ sed -r '/console\./ a \\' server.js
// File name: ~/Documents/server.js
// Dependencies
var express = require('express');
var bodyParser = require('body-parser');

// Default to port 3000
var PORT = process.env.PORT || 3000;
var app = express();

if (!process.env.PORT) {
  console.warn('Defaulting to port 3000');

  console.warn("It's better if you pass the server port in an ENV variable");

}

// Express middleware
console.debug('Injecting middleware');

app.use(bodyParser.json());
# more text...

Insert a blank line after any line matching 'console.'

# '-i' to edit in place... that is, save changes to file
~/Documents $ sed -i -r -e '/^\s*\/\// d' -e '/console\.(debug|log)/ d' -e 's/PORT/IN/ g' server.js

# No output after hitting enter
~/Documents $ cat server.js
var express = require('express');
var bodyParser = require('body-parser');

var IN = process.env.IN || 3000;
var app = express();

if (!process.env.IN) {
  console.warn('Defaulting to port 3000');
  console.warn("It's better if you pass the server port in an ENV variable");
}

app.use(bodyParser.json());
app.use(bodyParser.urlencoded({extended: false});

app.get('/', function (req, res) {
  res.send({message: 'Hello, express!'});
});

app.listen(IN, function () {
  console.info('Server listening on port', IN);
});

Save all edits

AWK

Stands for...

Well, nothing too fancy, actually.

It was named after the initials of its creators: Alfred Aho, Peter Weinberger, Brian Kernighan

What is AWK useful for?

Text processing...

(Bad) joking aside, AWK lets you process row/column formatted input (like what you see in the /etc/passwd file)...

where each row is supposed to be a record, and each column a field

Think of it as a mix of cut + tr + loads of steroids

Cool... and the examples?

# '-F' tells AWK how fields are separated within each record
~/Documents $ awk -F':' '{ print $1 $3 }' /etc/passwd
root0
bin1
daemon2
adm3
lp4
sync5
shutdown6
halt7
mail8
operator11
games12
ftp14
nobody99
avahi-autoipd170
systemd-bus-proxy999
systemd-network998
dbus81

# more output...

Print the username and UID from /etc/passwd

Username

UID

# OK, that didn't look quite good... fix it a bit
# Notice how the space is indicated
~/Documents $ awk -F':' '{ print $1 " " $3 }' /etc/passwd
root 0
bin 1
daemon 2
adm 3
lp 4
sync 5
shutdown 6
halt 7
mail 8
operator 11
games 12
ftp 14
nobody 99
avahi-autoipd 170
systemd-bus-proxy 999
systemd-network 998
dbus 81

# more output...

Placing strings side by side is the way you tell AWK to concatenate them; the quoted " " is what adds the space!

~/Documents $ awk -F':' '$3 > 500 { print $1 " " $3 }' /etc/passwd
systemd-bus-proxy 999
systemd-network 998
polkitd 997
santiaro90 1000
geoclue 996
unbound 995
openvpn 994
lightdm 993
setroubleshoot 992
nm-openvpn 991
nm-openconnect 990
vboxadd 989
test 1001

OK, do the same, but only when UID > 500

~/Documents $ awk -F':' '$3 > 500 { if(length($1) > 8) { print $1 " " $3 } }' /etc/passwd
systemd-bus-proxy 999
systemd-network 998
santiaro90 1000
setroubleshoot 992
nm-openvpn 991
nm-openconnect 990

Also, exclude names that are 8 characters or shorter

Nice, but that command's starting to become nasty... isn't it?

# Write this in a file... let's say ~/Documents/awk_tuto

$3 > 500 {
    if (length($1) > 8) {
        print $1 " " $3
    }
}

You can write an AWK script and read commands from there, too!

# Magic!!!...
~/Documents $ awk -F':' -f awk_tuto /etc/passwd
systemd-bus-proxy 999
systemd-network 998
santiaro90 1000
setroubleshoot 992
nm-openvpn 991
nm-openconnect 990

...then let AWK know your commands are to be read from that file:

BEGIN {
    print "We are adding a header"
    FS = ":"
}

$3 > 500 {
    if (length($1) > 8) {
        print $1 " " $3
    }
}

Let's change our file a little bit...

# Hey, we're not even telling what separator's being used... :)
~/Documents $ awk -f awk_tuto /etc/passwd
We are adding a header
systemd-bus-proxy 999
systemd-network 998
santiaro90 1000
setroubleshoot 992
nm-openvpn 991
nm-openconnect 990

...and then look what happens

The BEGIN block is executed before any line gets processed.

FS stands for Field Separator, and tells AWK... well, you know...

AWK provides a couple more variables like FS... Let's see some of them in action.

BEGIN {
    print "We are adding a header"
    FS = ":"
    OFS = "->"    # output field separator
}

# Skip first 15 records
# Process record only if UID > 500
$3 > 500 && NR > 15 {    # NR is current record index (starting from 1)
    print $1, $3    # use comma so OFS works nicely
}

# Yup, there's an END block as well... ;)
END {
    print "This is kinda a footer"
}

Edit the script one more time...

# Hey, we're not even telling what separator's being used... :)
~/Documents $ awk -f awk_tuto /etc/passwd
We are adding a header
systemd-network->998
polkitd->997
santiaro90->1000
geoclue->996
unbound->995
openvpn->994
lightdm->993
setroubleshoot->992
nm-openvpn->991
nm-openconnect->990
vboxadd->989
test->1001
This is kinda a footer

...and check it out!

Some Links

Introduction to the CLI: Sed and AWK

By Santiago Álvarez Rodríguez

Introduction to the CLI: Sed and AWK

Using Sed and AWK for advanced text manipulation

  • 977