Add pdfgrep.sh script
author Colin McCabe <cmccabe@alumni.cmu.edu>
Sun, 13 Dec 2009 23:35:04 +0000 (15:35 -0800)
committer Colin McCabe <cmccabe@alumni.cmu.edu>
Sun, 13 Dec 2009 23:35:04 +0000 (15:35 -0800)
pdfgrep.sh [new file with mode: 0755]

diff --git a/pdfgrep.sh b/pdfgrep.sh
new file mode 100755 (executable)
index 0000000..330eb92
--- /dev/null
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+#
+# pdfgrep
+#
+# A script for searching through PDFs to find some words.
+#
+# PDFs are not directly greppable. This script converts each PDF to text with
+# pdftotext and greps the result, so you can search PDFs for key words just
+# as if they were plain text.
+#
+# usage: pdfgrep [pattern] [PDFs to search ...]
+#
+# Colin McCabe
+#
+
+# Print an error message to stderr and terminate the script with status 1.
+#
+# Arguments: $@ - the message to print
+die() {
+    # printf with a quoted argument keeps the message intact (no word
+    # splitting or globbing), and stderr keeps it out of the grep results.
+    printf '%s\n' "$*" >&2
+    exit 1
+}
+
+# Succeed (status 0) when the given file name ends in ".pdf", compared
+# case-insensitively; fail (status 1) otherwise.
+#
+# Arguments: $1 - file name to check
+ispdf() {
+    # A case pattern matches the dot literally and never word-splits or globs
+    # the name, unlike piping an unquoted echo into grep with '.pdf$'.
+    case "${1}" in
+        *.[Pp][Dd][Ff]) return 0 ;;
+        *) return 1 ;;
+    esac
+}
+
+# Both external tools are required: pdftotext converts a PDF to text and fold
+# wraps long lines. `command -v` is the shell builtin way to test for them.
+command -v pdftotext > /dev/null 2>&1 || die "you must have pdftotext installed"
+command -v fold > /dev/null 2>&1 || die "you must have fold installed"
+
+# The first argument is the pattern; every remaining argument is a file.
+[ "$#" -ge 1 ] || die "usage: pdfgrep [pattern] [PDFs to search ...]"
+PATTERN=$1
+shift
+
+for PDF in "$@"; do
+    # Arguments that do not look like PDFs are skipped silently.
+    if ispdf "${PDF}"; then
+        # Stream the converted text through the pipeline instead of writing
+        # temp files named after the PDF: that naming broke for any PDF given
+        # with a directory component. "-" makes pdftotext write to stdout.
+        # --label keeps the "<pdf>.txt:" prefix on each match, and -e with a
+        # quoted pattern keeps multi-word or dash-leading patterns intact.
+        pdftotext "${PDF}" - |
+            fold -s -w 120 |
+            grep --with-filename --label="${PDF}.txt" -i --color=always \
+                -e "${PATTERN}"
+    fi
+done
+
+exit 0