Mini HOWTO wget
Mini HOWTO wget
Remember, the man is your friend.
man wget | less
# k keys to scroll up
# j keys to scroll down
# /str search forward
# ?str search backward
# 1G top of man page
# G bottom of man page
Commonly used options
[options]
-P dirname # save files into directory named dirname
-p # download all page requisites (images, stylesheets, etc.) needed for proper viewing
-E # save text/html pages as .html
-r # recursively
-k # convert links for local viewing
-U "Mozilla" # send request as a Mozilla user agent (web browser)
-w N # wait N seconds between download requests
--no-parent # do not recurse into parent
-nd # do not create hierarchy of directories
-nH # do not create host directory
--cut-dirs=N # skip the first N remote directory components when creating local directories
Examples - wget
-
wget [-k] convert links for proper viewing, no descending directory
wget -k http://phoenix.csc.calpoly.edu/~st50/index.html
#     |
#     convert links
chmod 644 index.html   # chmod of file for web server
-
wget [-p] [-k] get all resources for proper viewing, convert links, with no subdirectories
wget -p -k -nd http://phoenix.csc.calpoly.edu/~st50/index.html
#     |  |  |
#     |  |  do not create descending directory
#     |  convert links
#     get resources for proper viewing
chmod 644 index.html   # chmod of file for web server
-
wget [-P] put into directory
# Put into directory st50/ all resources with links converted for proper viewing
wget -P st50 -p -k -nd http://phoenix.csc.calpoly.edu/~st50/index.html
#     | |     |  |  |
#     | |     |  |  do not create subdirectories
#     | |     |  convert links
#     | |     get all resources for proper viewing
#     | dirname
#     put into dirname
chmod 755 st50              # chmod of dir for web server
chmod 644 st50/index.html   # chmod of file for web server
-
wget recursively, no parent, no host, cut dirs, etc.
wget -P st50 -r -p -k --no-parent -nH --cut-dirs=1 http://phoenix.csc.calpoly.edu/~st50/index.html
#     | |     |  |  |   |          |    |
#     | |     |  |  |   |          |    do not create first subdirectory, i.e. ~st50
#     | |     |  |  |   |          do not create hostname directory
#     | |     |  |  |   do not recurse into parent
#     | |     |  |  convert links
#     | |     |  get all resources for proper viewing
#     | |     get recursively, create subdirectories
#     | dirname
#     put into dirname
find st50 -type d -exec chmod 755 {} \;   # find all dirs in st50 and chmod for web server
find st50 -type f -exec chmod 644 {} \;   # find all files in st50 and chmod for web server
Script - wcopy
#!/usr/bin/bash
#
# wcopy
#
# Kurt Voelker
# 10/26/2005
#
# DESCRIPTION:
#
# wcopy fetches all files local to a URL and copies them into a directory,
# sets the perms for a webserver, and returns the name of a link to the file.
#
# Note: this script was designed for fetching individual webpages, not entire
# websites, and is not set up to handle .asp files or cookies.
#
# USAGE:
#
# $ wcopy dirname url # http://path/to/index.html or http://path/to/
#
# EXAMPLE: Fetch url and store in directory, returns filename of link
#
# $ wcopy st50 http://phoenix.csc.calpoly.edu/~st50/index.html
# --13:34:44-- http://phoenix.csc.calpoly.edu/%7Est50/index.html
# => `st50/index.html'
# Resolving phoenix.csc.calpoly.edu... done.
# Connecting to phoenix.csc.calpoly.edu[129.65.242.1]:80... connected.
# HTTP request sent, awaiting response... 200 OK
# Length: 867 [text/html]
#
# 100%[====================================>] 867 846.68K/s ETA 00:00
# ...
# ...
# ...
# FINISHED --13:34:44--
# Downloaded: 110,627 bytes in 9 files
# Converting st50/index.html... 5-1
# Converting st50/helloworld.html... nothing to do.
# Converting st50/index1.html... 2-0
# Converting st50/helloworld1.html... 1-0
# Converted 4 files in 0.04 seconds.
# st50/index.html
# \
# OUTPUT: \
# \
# Filename, i.e. link to downloaded file.
#
# RETURNS:
#
# Success exit 0
# Failure exit 1
#
# Check for improper input
#
# The script needs exactly two arguments:
#   $1  name of the directory to create; it must be non-empty and must not
#       exist yet
#   $2  URL to fetch; it must start with http:// or ftp://
# Any violation prints a message and exits with status 1.
#
# All expansions are quoted so that a directory name containing whitespace or
# glob characters is tested as a single word instead of breaking the test.
if [[ $# -ne 2 || -z "$1" ]]; then
  printf 'Usage: %s dirname URL\n' "$(basename -- "$0")"
  exit 1
fi

if [[ -e "$1" ]]; then
  printf '%s already exists\n' "$1"
  exit 1
fi

# Only the URL scheme is validated here; nothing else about the URL is checked.
if [[ "$2" != http://* && "$2" != ftp://* ]]; then
  printf 'Usage: %s dirname URL # URL is http://... or ftp://...\n' \
    "$(basename -- "$0")"
  exit 1
fi
# Calculate --cut-dir number, and assign to num
#
# count_cut_dirs URL
#   Print the number of directory components in the path of URL, i.e. the
#   number of '/' characters that follow the host name.  That is the value to
#   hand to wget's --cut-dirs so the fetched page is stored directly in the
#   target directory.
#
#   Arguments: $1 - URL (http://... or ftp://...)
#   Outputs:   the count on stdout; 0 for a URL without a path or with an
#              unrecognised scheme
#   Returns:   0 always
count_cut_dirs() {
  local rest=$1
  local without_slashes

  case "$rest" in
    http://* | ftp://*)
      rest=${rest#*://} # drop the scheme
      ;;
    *)
      printf '0\n'
      return 0
      ;;
  esac

  # Slashes inside a query string or fragment are not directories.
  rest=${rest%%[?#]*}

  # No '/' after the host means there is no path at all (e.g. http://host).
  if [[ "$rest" != */* ]]; then
    printf '0\n'
    return 0
  fi

  rest=${rest#*/} # drop the host, keep only the path
  without_slashes=${rest//\//}
  printf '%s\n' "$((${#rest} - ${#without_slashes}))"
}

num=$(count_cut_dirs "$2")
# Use wget to retrieve webpages and store into $1
#
#   -P "$1"            save everything under directory $1
#   -p                 also fetch the resources the page needs for viewing
#   -k                 convert links for local viewing
#   -E                 save text/html pages as .html
#   -U Mozilla         send the request as a Mozilla user agent
#   --no-parent        do not recurse into the parent directory
#   -nH                do not create a host directory
#   --cut-dirs="$num"  cut that many leading remote directories
#
# wget's exit status is not checked here; the rest of the script decides
# success by looking for the downloaded page.
wget -P "$1" -p -k -E -U Mozilla --no-parent -nH --cut-dirs="$num" "$2"

# Remember where the copy came from in $1/url, unless the download itself
# already produced a file with that name.  The URL is written with printf and
# quoted so characters such as '?' or '*' are stored literally instead of
# being glob-expanded.  Nothing is written when wget did not create $1.
if [[ -f "$1/url" ]]; then
  echo "$1/url already exists!"
elif [[ -d "$1" ]]; then
  printf '%s\n' "$2" > "$1/url" # save url
fi
# Create link
#
# resolve_link DIR URL
#   Work out which file inside DIR is the entry page for URL and print its
#   path.
#
#   * A URL ending in '/' (or one with no path at all) maps to DIR/index.html.
#   * Any other URL maps to DIR/<last path component>.  Because wget is run
#     with -E, the file may have been saved with an extra ".html" suffix, so
#     that name is tried as well.
#   * If the entry page is not index.html and DIR/index.html does not exist,
#     a hard link DIR/index.html -> entry page is created and index.html is
#     reported instead.
#
#   Arguments: $1 - download directory, $2 - URL that was fetched
#   Outputs:   path of the link on stdout (nothing on failure)
#   Returns:   0 if the reported file exists, 1 otherwise
resolve_link() {
  local dir=$1 url=$2
  local index="$dir/index.html"
  local rest name link

  rest=${url#*://} # host/path without the scheme
  case "$rest" in
    */) name=index.html ;;    # directory URL
    */*) name=${rest##*/} ;;  # last path component
    *) name=index.html ;;     # bare host, no path
  esac
  link="$dir/$name"

  # -E may have appended .html to a page whose name lacked that suffix.
  if [[ ! -e "$link" && -e "$link.html" ]]; then
    link="$link.html"
  fi

  # Provide an index.html entry point when the page has another name.
  if [[ "$link" != "$index" && ! -e "$index" && -e "$link" ]]; then
    ln -- "$link" "$index" || return 1
    link=$index
  fi

  [[ -e "$link" ]] || return 1
  printf '%s\n' "$link"
}

# An empty $link tells the rest of the script that no page was found.
link=$(resolve_link "$1" "$2") || link=
# Set perms for webserver
#
# Directories become 755 and regular files 644.  "$1" is quoted so a directory
# name with whitespace or glob characters is passed to find as one path, and
# the guard keeps find from running at all when the download directory was
# never created.
if [[ -d "$1" ]]; then
  find "$1" -type d -exec chmod 755 {} +
  find "$1" -type f -exec chmod 644 {} +
fi

# Exit early if no link
if [[ -z "$link" ]]; then
  exit 1
fi

# The link path is the script's result on stdout.
printf '%s\n' "$link"
exit 0
# EOF

