skip to content

awk / gawk β€” Text Processing

Pattern-action language for structured text. Field splitting, built-in variables, arithmetic, string functions, arrays, BEGIN/END blocks, and practical data-processing recipes.

5 min read 12 snippets 2d ago deep dive

awk / gawk β€” Text Processing#

Syntax#

awk [OPTIONS] 'PROGRAM' [FILE...]
awk [OPTIONS] -f script.awk [FILE...]
awk -v VAR=value 'PROGRAM' [FILE...]

A program is a series of pattern { action } rules. Both are optional:

  • No pattern → action runs on every line
  • No action → default is { print } (prints the matching line)

Built-in variables#

Variable      Meaning
$0            Entire current record (line)
$1 … $NF      Fields 1 through NF
NF            Number of fields in current record
NR            Total records read so far
FNR           Record number within current file
FS            Input field separator (default: whitespace)
OFS           Output field separator (default: space)
RS            Input record separator (default: \n)
ORS           Output record separator (default: \n)
FILENAME      Current input file name
ARGC / ARGV   Argument count / array

BEGIN and END blocks#

BEGIN { FS=","; OFS="\t" }   # run before any input
{ print $2, $1 }              # run per record
END   { print "Total:", NR }  # run after all input

Field separator#

awk -F: '{print $1}' /etc/passwd          # colon-separated
awk -F'\t' '{print $3}' data.tsv          # tab-separated
awk -F', *' '{print $2}' file             # comma + optional spaces
awk 'BEGIN{FS="|"} {print $1}' pipe.txt   # pipe character
awk -F'[,;]' '{print $1, $2}' file        # regex separator

Patterns#

awk '/error/'             log      # print lines matching regex
awk '!/^#/'              config    # skip comment lines
awk 'NR==1'              file      # first line only
awk 'NR>=10 && NR<=20'  file      # lines 10–20
awk '$3 > 100'           data      # field comparison
awk '$1 ~ /^foo/'        file      # field matches regex
awk '/START/,/END/'      file      # range: START to END (inclusive)

Printf and output#

awk '{printf "%-20s %5d\n", $1, $2}' file   # formatted output
awk '{print $2 > "out.txt"}'         file   # redirect to file
awk '{print $1 >> "append.txt"}'     file   # append
awk '{print | "sort -rn"}'           file   # pipe to command

String functions#

Function            Description
length(s)           Length of string (or $0 if no arg)
substr(s, i, n)     Substring from index i (1-based), length n
index(s, t)         First position of t in s (0 = not found)
split(s, a, sep)    Split s into array a using sep
sub(r, s, t)        Replace first regex r match in t with s
gsub(r, s, t)       Replace all regex r matches in t with s
match(s, r)         Sets RSTART, RLENGTH; returns position or 0
sprintf(fmt, ...)   Format string (like printf, returns string)
tolower(s)          Lowercase
toupper(s)          Uppercase
gensub(r, s, h, t)  gawk: replace with \1 groups, h="g" for global
awk '{print toupper($1), length($0)}' file
awk '{gsub(/foo/, "bar"); print}'     file        # replace in $0
awk '{sub(/^[ \t]+/, ""); print}'     file        # ltrim
awk '{gsub(/[ \t]+$/, ""); print}'    file        # rtrim
awk 'match($0, /[0-9]+/) {print substr($0, RSTART, RLENGTH)}' file

Numeric functions#

awk '{print int($1), sqrt($2), $3^2}' data
awk 'BEGIN{srand()} {print int(rand()*100)}' /dev/stdin
awk '{printf "%.2f\n", $1/$2}' nums

Arrays#

# Frequency count
awk '{count[$1]++} END {for (k in count) print k, count[k]}' file

# Associative array from CSV: id → name
awk -F, 'NR>1 {map[$1]=$2} END {for (id in map) print id, map[id]}' data.csv

# Delete element
awk '{delete seen[$1]; seen[$1]=$2}' file

# Array test
awk '$1 in seen {print "dup:", $1} {seen[$1]=1}' file

Multi-file processing#

# FNR vs NR
awk 'FNR==1 {print "--- File:", FILENAME}' file1 file2

# Process only second file
awk 'FNR==NR {ids[$1]=1; next} $1 in ids' list.txt data.txt

Practical recipes#

# Sum a column
awk '{sum+=$3} END {print sum}' data.txt

# Average
awk '{sum+=$1; n++} END {print sum/n}' numbers.txt

# Print columns in different order
awk '{print $3, $1, $2}' file.txt

# Skip header, process rest
awk 'NR>1 {print $2, $4}' report.csv

# Print unique lines (ordered, like sort|uniq)
awk '!seen[$0]++' file.txt

# Print duplicate lines only
awk 'seen[$0]++ == 1' file.txt

# Concatenate lines every N records
awk 'ORS= (NR%3 ? " " : "\n")' file    # join every 3 lines

# Extract value from key=value
awk -F= '/^timeout/{print $2}' config.ini

# Column-align a colon file
awk -F: '{printf "%-15s %-10s %s\n", $1,$3,$7}' /etc/passwd

# Top N by field
awk '{print $5, $0}' access.log | sort -rn | head -10 | cut -d' ' -f2-

# Running total
awk '{running+=$1; print running, $0}' ledger.txt

# Filter by date field (YYYY-MM-DD in $2)
awk '$2 >= "2025-01-01" && $2 <= "2025-03-31"' events.log

# Accumulate by group, then report
awk -F, '{bytes[$1]+=$3} END {
  for (h in bytes) printf "%s\t%.1f MB\n", h, bytes[h]/1048576
}' access.csv | sort -k2 -rn

# Transpose rows to columns
awk '{for(i=1;i<=NF;i++) col[i]=col[i] (NR>1?"\t":"") $i}
     END {for(i=1;i<=NF;i++) print col[i]}' matrix.txt

Multiline records#

# Blank-line-separated records (like paragraphs)
awk 'BEGIN{RS=""; FS="\n"} /keyword/{print $1}' file

# CSV with quoted fields (commas inside quotes) — use gawk's FPAT
gawk 'BEGIN{FPAT="([^,]*)|(\"[^\"]+\")"} {print $2}' data.csv

One-liners reference#

awk 'END{print NR}' file              # count lines (wc -l)
awk '{print NF}'   file              # print field count per line
awk 'NF'           file              # remove blank lines
awk 'length>72'    file              # lines longer than 72 chars
awk '{$1=$1; print}' file            # collapse whitespace, trim
awk '{print $NF}'  file              # print last field
awk '{print $(NF-1)}' file           # print second-to-last field
awk 'NR%2==0'      file              # print even-numbered lines
awk 'NR==FNR{a[$0];next} $0 in a'  f1 f2  # intersection of two files
awk 'NR==FNR{a[$0];next} !($0 in a)' f1 f2 # lines in f2 not in f1

[!TIP] gawk (GNU awk) extends POSIX awk with gensub(), FPAT for CSV, nextfile, co-processes (|&), and more. On most Linux systems awk is already gawk; check with awk --version.