#!/bin/bash

#
# pdfgrep
#
# A script for searching through PDFs to find some words.
#
# PDFs are not directly greppable. This script converts each PDF to text with
# pdftotext, wraps the text to 120 columns with fold, and greps the result
# case-insensitively, so PDFs can be searched as if they were plain text.
#
# usage: pdfgrep [pattern] [PDFs to search ...]
#
# Arguments that do not end in ".pdf" (any letter case) are silently skipped.
# Matching lines are printed with the PDF's own path as the filename prefix
# and with the match highlighted (grep --color=always).
#
# Exit status: 0 after a normal run (whether or not anything matched),
#              1 if a required tool is missing,
#              2 if no pattern was given.
#
# Colin McCabe
#

# Print the usage summary on stdout.
usage() {
  printf '%s\n' "usage: pdfgrep [pattern] [PDFs to search ...]"
}

# Print a message on stderr and terminate the script with status 1.
# Arguments: the message text.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Succeed when the given name ends in ".pdf", ignoring letter case.
# Arguments: $1 - file name to test.
# Returns:   0 if it looks like a PDF name, 1 otherwise.
ispdf() {
  # A glob with a literal dot: names such as "foopdf" must not match.
  case "${1-}" in
    *.[Pp][Dd][Ff]) return 0 ;;
    *) return 1 ;;
  esac
}

# Search every PDF argument for the pattern.
# Arguments: $1   - pattern handed to grep (matched case-insensitively)
#            $2.. - candidate files; non-PDF names are skipped
# Outputs:   matching lines on stdout, prefixed with the PDF path.
# Returns:   0 on completion, 2 when no pattern was supplied.
#            Exits with status 1 (via die) when a required tool is missing.
main() {
  if (( $# < 1 )); then
    usage >&2
    return 2
  fi

  # command -v is the portable replacement for which.
  command -v pdftotext > /dev/null 2>&1 \
    || die "you must have pdftotext installed"
  command -v fold > /dev/null 2>&1 \
    || die "you must have fold installed"

  local pattern=$1
  shift

  local pdf
  for pdf in "$@"; do
    ispdf "${pdf}" || continue

    # Stream the extracted text instead of writing temporary files: PDF paths
    # containing directories then need no mirrored temp tree, and nothing has
    # to be cleaned up. grep reads stdin, so --label supplies the name shown
    # by --with-filename. -e keeps the pattern intact when it contains spaces
    # or starts with a dash.
    pdftotext "${pdf}" - \
      | fold -s -w 120 \
      | grep --with-filename --label="${pdf}" -i --color=always \
        -e "${pattern}"
  done

  return 0
}

# Last command of the script: its return value becomes the exit status.
main "$@"