#!/bin/bash
#
# pat2pdf version 0.97
# Copyright (c)2000 Oren Tirosh
# Released under the GPL
#
# This script connects to the USPTO patent database, retrieves the TIFF
# patent images and converts them into a single pdf file using GhostScript.
#
# Yes, I know, you would have written it in perl/python/(insert your
# favourite scripting language here)
#
# It requires an http fetcher (lynx by default), GhostScript
# and tiff2ps (part of libtiff)
#
# For best results (small PDFs) use GhostScript 5.5 or higher.
#
#Usage:
#  pat2pdf <patent number>
#
# Result is a file in the current directory named pat<patent number>.pdf
#
#Bugs:
#
# Error checking and recovery could be better.
#
#Homepage:
#  http://www.tothink.com/pat2pdf
#

# Real name of the patent search site. For some reason the USPTO links to it
# by IP address - are they trying to hide anything?
SITENAME="http://pto.dwsearch.com"

# Some utility functions:

# Succeeds (exit status 0) if the first string is non-empty and contains the
# second string. The second string is used as a glob pattern, so the globbing
# chars ?*[x-y] are active in it.
#   $1 - string to search in
#   $2 - pattern to look for
contains() {
  [[ -n "$1" && -z "${1##*$2*}" ]]
}

# Write the body of the URL in $1 to stdout.
# Change this if you prefer wget, curl, etc.
url2stdout() {
  lynx -dump -source "$1" 2>/dev/null
}

# Die with a message: print all arguments to stderr and exit with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Extract a field from a string and echo it to stdout.
#   $1 - source string
#   $2 - before target field (glob pattern)
#   $3 - prefix of target field (kept in the output)
#   $4 - after target field (glob pattern)
# The field starts at the LAST occurrence of $2$3 and ends just before the
# first following occurrence of $4. Returns 1 (printing nothing) if the source
# is empty or does not match *$2$3*$4*.
# Note: an empty $4 makes the final trim remove everything, so the result is
# an empty line - always pass a real terminator.
extract() {
  local source=$1 before=$2 prefix=$3 after=$4
  local field

  [[ -n "$source" ]] || return 1
  [[ -z "${source##*${before}${prefix}*${after}*}" ]] || return 1

  # Drop everything up to and including the last "before+prefix", then put
  # the prefix back so it is part of the result.
  field="${prefix}${source##*${before}${prefix}}"
  # Cut at the first terminator.
  field="${field%%${after}*}"
  printf '%s\n' "$field"
}

# Verify the presence of a required executable; dies if $1 is not found.
# command -v is used instead of `[ -x $(which ...) ]`: when the lookup prints
# nothing that test collapses to `[ -x ]`, which is always true.
verify() {
  command -v "$1" >/dev/null 2>&1 \
    || die "Error: required executable $1 not found"
}

#main()

for tool in lynx tiff2ps ps2pdf head mv rm; do
  verify "$tool"
done

# Accept numbers written with thousands separators (e.g. 6,000,000) by
# removing commas; stray whitespace is removed as well.
PATNUM=${1-}
PATNUM=${PATNUM//[,[:space:]]/}

[[ -n "$PATNUM" ]] || die "usage: pat2pdf <patent number>"
[[ -z "${PATNUM##[a-zA-Z1-9][a-zA-Z0-9][0-9][0-9]*}" ]] \
  || die "Use a 7 digit patent number."
# The number ends up inside URLs and file names, so refuse anything that is
# not purely alphanumeric.
[[ "$PATNUM" != *[!a-zA-Z0-9]* ]] || die "Use a 7 digit patent number."
# Main flow: look the patent up, locate its page images, fetch every page as
# TIFF, convert the pages to PostScript and feed them to ps2pdf.
# Relies on PATNUM, SITENAME and the helpers contains/url2stdout/die/extract
# defined earlier in this script. Progress messages go to stderr.

echo "...fetching search results page for patent $PATNUM" >&2
RESULTPAGE=$(url2stdout "${SITENAME}/netacgi/nph-Parser?TERM1=${PATNUM}&Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2Fsrchnum.htm&r=0&f=S&l=50") \
  || die "Error fetching search results web page."

if contains "$RESULTPAGE" "No patents have matched"; then
  die "No patents have matched your query."
fi
contains "$RESULTPAGE" "PAT. NO." \
  || die "Search results page in unexpected format - please notify author."

# The title is the text after the last "RS=PN/<7 chars>>" link target, up to
# the next tag. NOTE(review): "<" as terminator assumes the title is followed
# directly by markup - confirm against a live results page. An empty
# terminator cannot be used here: it makes extract return an empty string.
TITLE=$(extract "$RESULTPAGE" "RS=PN/???????>" "" "<") || TITLE=""
[[ -n "$TITLE" ]] \
  || TITLE="##Error isolating patent title - continuing anyway##"
printf '%s\n' "U.S. Patent $PATNUM: $TITLE"

PATENTURL=$(extract "$RESULTPAGE" "HREF=" "/netacgi/nph-Parse" ">") \
  || die "Error isolating URL from results page."
PATENTURL="${SITENAME}${PATENTURL}"

echo "...fetching patent page" >&2
# Only the first 3500 bytes of the page are kept for the image-link search.
# The pipeline's status is head's, not the fetcher's, so a failed download is
# detected by checking for an empty result as well.
PATENTPAGE=$(url2stdout "$PATENTURL" | head -c 3500) \
  || die "Error fetching patent web page."
[[ -n "$PATENTPAGE" ]] || die "Error fetching patent web page."

IMAGEURL=$(extract "$PATENTPAGE" "a href=" "http://patimg" ">") \
  || die "Error isolating image page URL from patent page."
IMAGESERVER=$(extract "$IMAGEURL" "" "http://patimg" "/.piw") \
  || die "Error isolating image server name."

echo "...fetching images page" >&2
IMAGEPAGE=$(url2stdout "$IMAGEURL") || die "Error fetching images page."

NUMPAGES=$(extract "$IMAGEPAGE" "-- NumPages=" "" " --") \
  || die "Error getting number of pages."
# The page count drives the fetch loop, so insist on a positive decimal
# integer; 10# keeps a value with leading zeros from being read as octal.
NUMPAGES=${NUMPAGES//[[:space:]]/}
[[ "$NUMPAGES" =~ ^[0-9]+$ ]] || die "Error getting number of pages."
NUMPAGES=$((10#$NUMPAGES))
((NUMPAGES >= 1)) || die "Error getting number of pages."

TIFFURL=$(extract "$IMAGEPAGE" "embed src=?" "/.DImg" "? width=") \
  || die "Error getting TIFF file URL."
TIFFURL="$IMAGESERVER$TIFFURL"
contains "$TIFFURL" "PageNum=1" || die "Error processing TIFF file URL"
# Split the page-1 URL around its page number so any page can be requested.
TIFFURL1="${TIFFURL%%PageNum=1*}PageNum="
TIFFURL2="${TIFFURL##*PageNum=1}"

TIFFTMP="pattmp${PATNUM}.tiff"
PDFTMP="tmppat${PATNUM}.pdf"
# Remove the intermediate files on every exit path, including die.
trap 'rm -f "$TIFFTMP" "$PDFTMP"' EXIT

# Producer: fetch each page and write its PostScript to stdout.
# Consumer: ps2pdf turns the concatenated PostScript into one PDF.
{
  PAGE=1
  while ((PAGE <= NUMPAGES)); do
    echo "...fetching page $PAGE of $NUMPAGES" >&2
    url2stdout "${TIFFURL1}${PAGE}${TIFFURL2}" >"$TIFFTMP" \
      || die "Error retrieving TIFF page."
    tiff2ps "$TIFFTMP" 2>/dev/null || die "tiff2ps error"
    PAGE=$((PAGE + 1))
  done
} | ps2pdf /dev/stdin "$PDFTMP"
STAGES=("${PIPESTATUS[@]}")

# Both pipeline stages run in subshells, so a die inside the loop only ends
# that subshell. Check each stage's status here to stop the script for real.
# The producer has already printed its own error message.
[[ "${STAGES[0]}" -eq 0 ]] || exit 1
[[ "${STAGES[1]}" -eq 0 ]] || die "GhostScript error."

rm -f "$TIFFTMP"
mv -f "$PDFTMP" "pat${PATNUM}.pdf" || die "Error renaming pdf file."

echo "Done." >&2
echo "pat${PATNUM}.pdf" >&2