author     Alex Kozlov <ak@FreeBSD.org>    2013-10-21 11:23:56 +0000
committer  Alex Kozlov <ak@FreeBSD.org>    2013-10-21 11:23:56 +0000
commit     e4f71f27d5a6d24008c72531c2a6bf88b652d178 (patch)
tree       8744500d098e9bed4fc4e7ccf551d792d27df298 /www/googlebook_dl
parent     6ee9a8e343e6af40b5836206944980ac502b2b32 (diff)
Diffstat (limited to 'www/googlebook_dl')
-rw-r--r--  www/googlebook_dl/Makefile                    11
-rw-r--r--  www/googlebook_dl/files/googlebook_dl.sh     246
2 files changed, 189 insertions, 68 deletions
diff --git a/www/googlebook_dl/Makefile b/www/googlebook_dl/Makefile
index 32f345617269..d1034bd8fe40 100644
--- a/www/googlebook_dl/Makefile
+++ b/www/googlebook_dl/Makefile
@@ -1,14 +1,15 @@
-# Created by: <spam@rm-rf.kiev.ua>
# $FreeBSD$
PORTNAME= googlebook_dl
-PORTVERSION= 20100502
+PORTVERSION= 20120817
CATEGORIES= www
MASTER_SITES= # none
DISTFILES= # none
MAINTAINER= ak@FreeBSD.org
-COMMENT= A command-line utility for downloading books from Google Books
+COMMENT= Command-line utility for downloading books from Google Books
+
+LICENSE= BSD
RUN_DEPENDS= wget:${PORTSDIR}/ftp/wget
@@ -16,8 +17,8 @@ NO_BUILD= yes
PLIST_FILES= bin/${PORTNAME}
-NO_STAGE= yes
do-install:
- ${INSTALL_SCRIPT} ${FILESDIR}/${PORTNAME}.sh ${PREFIX}/bin/${PORTNAME}
+ ${INSTALL_SCRIPT} ${FILESDIR}/${PORTNAME}.sh \
+ ${STAGEDIR}${PREFIX}/bin/${PORTNAME}
.include <bsd.port.mk>
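
The do-install change above switches the port to staged installation (NO_STAGE is dropped and the script is installed under ${STAGEDIR}${PREFIX}). A minimal sketch of how that step expands, assuming the ports framework defaults of STAGEDIR=${WRKDIR}/stage and PREFIX=/usr/local (illustrative values, not part of this diff):

    # roughly what `make stage` runs for this port:
    #   install -c -m 555 files/googlebook_dl.sh \
    #       work/stage/usr/local/bin/googlebook_dl
    # `make package` then picks the file up from the staged tree
    # instead of writing into ${PREFIX} directly.
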
diff --git a/www/googlebook_dl/files/googlebook_dl.sh b/www/googlebook_dl/files/googlebook_dl.sh
index e8afa512c327..cfc5f5bf9679 100644
--- a/www/googlebook_dl/files/googlebook_dl.sh
+++ b/www/googlebook_dl/files/googlebook_dl.sh
@@ -1,8 +1,12 @@
#!/bin/sh
+#
+# SUBS
+#
+
parse_options()
{
- local OPT OPTARG OPTIND
+ local _proxylist
while getopts ap:P:vw: OPT; do
# escape meta
@@ -10,7 +14,13 @@ parse_options()
case ${OPT} in
a) all=yes ;;
- p) proxylist="${OPTARG}" ;;
+ p) _proxylist="${OPTARG}"
+ if [ -r "${_proxylist}" ]; then # file
+ proxylist=$(cat "${_proxylist}")
+ else # list
+ proxylist=$(echo "${_proxylist}" | sed -e 's/,/ /g')
+ fi
+ ;;
P) pageprefix="${OPTARG}" ;;
v) verbose=yes ;;
w) pagewidth="${OPTARG}" ;;
@@ -21,97 +31,204 @@ parse_options()
OPTC=$((${OPTIND} - 1))
}
+#
+# returns true if argument is a positive/negative whole integer.
+# stolen from bsdinstall
+#
+isinteger()
+{
+ local arg="$1"
+
+ # prevent division-by-zero
+ [ "${arg}" = "0" ] && return
+
+ # attempt to perform arithmetic division (an operation which will exit
+ # with error unless arg is a valid positive/negative whole integer).
+ ( : $((0/$arg)) ) > /dev/null 2>&1
+}
+
+err()
+{
+ local exitval
+
+ exitval=$1
+ shift
+ echo 1>&2 "${0##*/}: $*"
+ exit ${exitval}
+}
+
usage()
{
- echo "usage: ${0##*/} [-ahPpw] totpages bookid"
+ echo "usage: ${0##*/} [-ahPpvw] totpages bookid"
echo ' -h display this help'
- echo ' -a all mode (try to get sigs from all pages, including already downloaded)'
+ echo ' -a all mode (try to get links from all pages, including already downloaded)'
echo ' -P pageprefix (*PA, PP, PR, PT)'
- echo ' -p proxylist'
+ echo ' -p http://proxy.tld:port,proxy.tld,ip:port | proxylist.txt'
echo ' -v verbose'
echo ' -w pagewidth (800, *1024, 1280, 1440, 1680, ...)'
echo
exit 1
}
-get_pages()
+#
+# shows progress in dots/got_page numbers
+# stolen from portsnap
+#
+progress()
+{
+ local page
+
+ page=$1
+ if [ $((${page} % 10)) -eq 0 -a "${lastchar}" = '.' ]; then
+ echo -n ${page}
+ elif [ $((${page} % 2)) -eq 0 ]; then
+ echo -n .
+ fi
+}
+
+#
+# out $msg $verbose_msg
+#
+out()
{
- local ua page url _return
+ [ -z "$1" -a -z "$2" ] && err 3 'out(): bad syntax'
- # with wrong ua we will get 401 Unauthorized
- # ua='Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Firefox/3.0'
- ua='Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)'
+ if [ -n "${verbose}" -a -n "$2" ]; then
+ echo $2
+ elif [ -z "${verbose}" -a ! -z "$1" ]; then
+ [ "$1" = '.' ] && lastchar=.
+ case ${lastchar} in
+ [.ce]) printf "$1" ;;
+ *) printf " $1" ;;
+ esac
+ lastchar=${1#${1%?}}
+ fi
+}
+
+get_cookie()
+{
+ local cookie_str _return
+
+ # remove old cookie
+ rm "${cookie}" 2>/dev/null
# get cookie
wget -T5 -t2 -q -U"${ua}" --keep-session-cookies \
- --save-cookies "${DIR}/cookies.txt" -O/dev/null \
+ --save-cookies "${cookie}" -O/dev/null \
"http://books.google.com/books?id=${bookid}&pg=PA1&jscmd=click3"
- # bail if wget returned non zero exit code or cookies.txt is empty
+ # fail if wget returned a non-zero exit code or the cookie file is empty
_return=$?
- cookie="$(grep '^.google.com' "${DIR}/cookies.txt" 2>/dev/null | \
- sed 's/^.*\(ID.*\)$/\1/')"
- [ ${_return} -ne 0 -o -z "${cookie}" ] && \
- { rm "${DIR}/cookies.txt"; return 1; }
+ cookie_str="$(grep '^.google.com[[:space:]]' "${cookie}" 2>/dev/null | \
+ sed -ne 's/^.*\(ID=.*\)$/\1/p')"
+ if [ ${_return} -ne 0 -o -z "${cookie_str}" ]; then
+ rm "${cookie}" 2>/dev/null
+ out 'E\n' "cannot get cookie: ${cookie_str}"
+ return 1
+ fi
# show cookie
- [ -n "${verbose}" ] && echo "cookie: ${cookie}"
-
- # if downloaded less that half of total pages, use all mode
- [ $(ls "${bookid}/" | wc -l) -le $((${totpages} / 2)) ] && all=yes
-
- # pull sigs only from missing pages unless in all mode
- page=1
- while [ ${page} -le ${totpages} ]; do
- [ -f "${bookid}/${pageprefix}${page}" -a -z "${all}" ] || \
- echo "http://books.google.com/books?id=${bookid}&pg=${pageprefix}${page}&jscmd=click3" \
- >> "${DIR}/urls"
- page=$(( ${page} + 1))
- done
+ out 'c' "cookie: ${cookie_str}"
+}
+
+get_page()
+{
+ local page url urls _return
+
+ [ -z $1 ] && err 3 'get_page(): bad syntax'
+ page=$1
+
+ # pull signatures only from missing pages unless in all mode
+ [ -f "${bookid}/${pageprefix}${page}.png" -a -z "${all}" ] && return
- # get all sigs at once
- # NB! sigs tied to cookie and ip
- wget -T5 -t2 -q -U"${ua}" --no-cache --load-cookies "${DIR}/cookies.txt" \
- -O- -i "${DIR}/urls" | tr '}' '\n' | grep "{\"pid\":\"P.*\",\"src\":" | \
- sed 's/^.*"src":"\(http:\/\/[^"]*\)".*$/\1/;s/\\u0026/\&/g' | sort -u | \
- while read -r url; do
+ # change cookie every 100 pages
+ if [ $((${got_pages} % 100)) -eq 0 ]; then
+ get_cookie || return 1
+ fi
+ got_pages=$((${got_pages} + 1))
+
+ url="http://books.google.com/books?id=${bookid}&pg=${pageprefix}${page}&jscmd=click3"
+ out "$(progress ${got_pages})" "${pageprefix}${page}: ${url}&w=${pagewidth} TRY"
+
+ # NB! signatures tied to cookie and ip
+ urls=$(wget -T5 -t2 -q -U"${ua}" --no-cache \
+ --load-cookies "${cookie}" -O- \
+ "${url}" | tr '}' '\n' | grep "{\"pid\":\"P.*\",\"src\":" | \
+ sed 's/^.*"src":"\(http:\/\/[^"]*\)".*$/\1/;s/\\u0026/\&/g' | sort -u)
+
+ for url in ${urls}; do
page=$(echo "${url}" | sed 's/^.*&pg=\([^&]*\)&.*$/\1/')
- [ -n "${verbose}" ] && verbose="${page}: ${url}&w=${pagewidth}"
+ # check again whether the page is already downloaded; we usually get a few
+ # urls from a single request
+ if [ ! -f "${bookid}/${page}.png" ]; then
+ got_pages=$((${got_pages} + 1))
- # skip already downloaded pages
- [ -f "${bookid}/${page}" ] || \
- {
wget -T5 -t3 -q -U"${ua}" --no-cache \
- --load-cookies "${DIR}/cookies.txt" \
- -O"${bookid}/${page}" "${url}&w=${pagewidth}"
+ --load-cookies "${cookie}" \
+ -O"${bookid}/${page}.png" "${url}&w=${pagewidth}"
_return=$?
if [ ${_return} -ne 0 ]; then
- # sometimes google books returns 404
- rm "${bookid}/${page}"
- [ -n "${verbose}" ] && verbose="${verbose} ERROR"
+ # sometimes google books just returns 404
+ rm "${bookid}/${page}.png"
+ out 'e' "${page}: ${url}&w=${pagewidth} ERROR"
else
- if [ -n "${verbose}" ]; then
- verbose="${verbose} DOWNLOADED"
- else
- echo -n "${page} "
- fi
+ out "${page}" "${page}: ${url}&w=${pagewidth} DOWNLOADED"
fi
- }
-
- [ -n "${verbose}" ] && echo "${verbose}"
+ else
+ out '' "${page}: ${url}&w=${pagewidth} ALREADY"
+ fi
done
- # clean temp files
- rm "${DIR}/cookies.txt" "${DIR}/urls"
+}
+
+get_pages()
+{
+ local page got_pages
+
+ # for out(), progress()
+ local lastchar=.
+
+ got_pages=1
+
+ # randomize page requests - google books only shows 200-300 urls in one
+ # session
+ #
+ # if started on an odd second, count from 1 to totpages; on an even second - from totpages to 1
+ # [ $((`date -j "+%s"` % 2)) -eq 0 ] && descending_order=yes
+ # XXX not portable
+ if [ $(jot -r 1 0 1) -ne 0 ]; then
+ echo "fetching pages in ascending order"
+
+ get_cookie || return 1
+ page=1
+ while [ ${page} -le ${totpages} ]; do
+ get_page ${page} || return 1
+ page=$((${page} + 1))
+ done
+ else
+ echo "fetching pages in descending order"
+
+ get_cookie || return 1
+ page=${totpages}
+ while [ ${page} -ge 1 ]; do
+ get_page ${page} || return 1
+ page=$((${page} - 1))
+ done
+ fi
echo
}
+
#
# MAIN
#
+# with the wrong User-Agent we will get 401 Unauthorized
+# ua='Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Firefox/3.0'
+ua='Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)'
+
# default page width
pagewidth=1024
@@ -124,28 +241,31 @@ pageprefix=PA
parse_options ${1+"$@"}
shift ${OPTC}
+isinteger "${pagewidth}" || err 4 "pagewidth must be integer: ${pagewidth}"
+
[ -z $1 ] && usage
totpages=$1
+isinteger "${totpages}" || err 4 "totpages must be integer: ${totpages}"
[ -z $2 ] && usage
bookid=$2
-# if bookid dir already exists, continue from previous try
-[ -d "${bookid}" ] || \
-{
- mkdir "${bookid}" || { echo "cannot create dir ${bookid}"; exit 2; }
-}
+# if bookid dir already exists, continue from previous try
+if [ ! -d "${bookid}" ]; then
+ mkdir -- "${bookid}" || err 2 "cannot create dir ${bookid}"
+fi
+
+cookie=`mktemp -t cookie` || err 2 'mktemp error'
-DIR=`mktemp -d googlebook_dl.XXXXXXXXXX` || exit 2
-trap "rm -rf ${DIR}; exit 1" 1 2 3 10 13 15
+trap "rm ${cookie} 2>/dev/null; exit 1" 1 2 3 10 13 15
if [ -z "${proxylist}" ]; then
get_pages
else
- for http_proxy in `cat "${proxylist}"`; do
+ for http_proxy in ${proxylist}; do
echo "using proxy ${http_proxy}"
get_pages
done
fi
-rmdir "${DIR}"
+rm "${cookie}" 2>/dev/null
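
A hypothetical invocation of the installed script, pieced together from the usage() text above (the book id, page count, and proxy hosts are placeholders, not taken from this diff):

    # fetch 250 pages of book id AbCdEfGhIjK at width 1280, verbosely,
    # rotating through two proxies given as a comma-separated list:
    googlebook_dl -v -w 1280 -p http://proxy1.example:3128,proxy2.example:8080 250 AbCdEfGhIjK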