From 94ad9332c0a8213ad384007707d68ac8fd1810cb Mon Sep 17 00:00:00 2001 From: Tom Ryder Date: Tue, 8 Aug 2017 10:45:17 +1200 Subject: Nicer handling of RFC control chars --- bin/rfct.awk | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'bin') diff --git a/bin/rfct.awk b/bin/rfct.awk index 5ceef43f..230ac42c 100644 --- a/bin/rfct.awk +++ b/bin/rfct.awk @@ -6,9 +6,12 @@ BEGIN { ORS = "\n\n" } -# Skip paragraphs with ^L chars in them -# We have to be literal here due to mawk's failures -/ / { next } +# Skip paragraphs with ^L chars in them, as they likely contain headers and +# footers +/\f/ { next } -# If there's anything left, print it +# Strip out other control characters, but allow newline and tab +{ gsub(/[\a\b\r\v]/, "") } + +# If there's anything left after that, print it length($0) -- cgit v1.2.3