[MLton] Getting a local snapshot of the Wiki

Vesa Karvonen vesa.karvonen@cs.helsinki.fi
Mon, 25 Jul 2005 04:15:56 +0300


Below is a script I hacked together to get a local (HTML) snapshot of
the MLton Wiki for offline viewing. Beware that the script also downloads
all the attachments.

<--- begin script --->
#!/bin/bash
#
# Creates a local HTML snapshot of the MLton Wiki for offline viewing.
#
# Steps:
#   1. fetch the title index with lynx and store it in .index,
#   2. download every page listed there with wget,
#   3. download all site-absolute images referenced by those pages,
#   4. download all site-absolute link targets that contain a '/'
#      (the attachments),
#   5. rewrite the site-absolute links inside the pages so that they point
#      at the local copies (or at the live site for everything else).
#
# Everything is written below ./mlton.org; the work files .index, .images,
# .attachments and .script are kept there as well.
#
# The steps are chained with '&&' so that a failing step stops the run.
# 'pipefail' is deliberately NOT enabled: the grep stages legitimately
# return 1 when they select nothing.

# Prints the contents of every downloaded page named in .index.
# Names are read line by line, so quotes, backslashes or blanks in a page
# name are taken literally (a bare xargs would interpret them).
cat_pages() {
  local page
  while IFS= read -r page; do
    cat -- "$page"
  done < .index
}

# Reads HTML on stdin and prints the site-absolute targets of the attribute
# named by $1 (src or href), one per line, without the leading '/' and
# without a query string.
local_links() {
  grep -o -e " $1=\"/[^\"]*\"" |
    sed -e "s#^ $1=\"/##" -e 's#"$##' -e 's#[?].*##'
}

TIMEFORMAT='  Took %lR.' &&

base='http://www.mlton.org' &&
targetDir='mlton.org' &&

mkdir -p -- "$targetDir" &&
cd -- "$targetDir" &&

echo "Getting index:" &&
time lynx -dump "$base/Index?action=titleindex" |
  grep -v -e 'MoinEditorBackup' -e '^$' -e '^Preferences$' > .index &&
echo "Index" >> .index &&

echo "Getting pages:" &&
time wget -c -B "$base" -nv -i .index &&

echo "Getting site images:" &&
cat_pages | local_links src | LC_ALL=C sort -u > .images &&
time wget -c -B "$base" -x -nH -nv -i .images &&

echo "Getting attachments:" &&
cat_pages | local_links href | grep -e '/' |
  grep -v -e 'MoinEditorBackup' | LC_ALL=C sort -u > .attachments &&
time wget -c -B "$base" -x -nH -nv -i .attachments &&

echo "Fixing pages:" &&
# Build .script, a sed program with one rule per local file.  Each name is
# first escaped (backslash, '.', '*', '[' and the rule delimiter '@') so it
# is matched literally; the rule then captures the name and re-emits it via
# \1, so the name never appears in the replacement text.
# Rule order matters: local pages, images and attachments are made relative
# first, and only the links that are still absolute afterwards are pointed
# at the live site.
escapeName='s#[\\.*[@]#\\&#g' &&
sed -e "$escapeName" \
    -e 's#^\(.*\)$#s@= *"/\\(\1\\)"@="\\1"@g#' .index > .script &&
sed -e "$escapeName" \
    -e 's#^\(.*\)$#s@= *"/\\(\1\\)\\([?][^"]*\\)*"@="\\1"@g#' .images >> .script &&
sed -e "$escapeName" \
    -e 's#^\(.*\)$#s@= *"/\\(\1\\)\\([?][^"]*\\)*"@="\\1"@g#' .attachments >> .script &&
printf '%s\n' \
  's# href *= *"Preferences"# href="'"$base"'/Preferences"#g' \
  's# href *= *"/"# href="Home"#g' \
  's# href *= *"/# href="'"$base"'/#g' >> .script &&
fixFailed=0 &&
time while IFS= read -r page; do
  printf ' %s\n' "$page"
  # Write to a temporary file and move it into place: the page is never
  # truncated before sed has read it, and its content (including trailing
  # newlines) is not passed through a shell variable.
  if sed -f .script < "$page" > .page.tmp; then
    mv -- .page.tmp "$page"
  else
    printf 'warning: could not fix %s\n' "$page" >&2
    rm -f -- .page.tmp
    fixFailed=1
  fi
done < .index &&
(( fixFailed == 0 ))
<--- end script --->