[MLton] Getting a local snapshot of the Wiki
Vesa Karvonen
vesa.karvonen@cs.helsinki.fi
Mon, 25 Jul 2005 04:15:56 +0300
Below is a script I hacked together to get a local (HTML) snapshot of
the MLton Wiki for offline viewing. Beware that the script also downloads
all the attachments.
<--- begin script --->
#!/bin/bash
#
# Build a local HTML snapshot of the MLton wiki for offline viewing.
#
# Everything is downloaded into ./mlton.org (created if missing):
#   1. the list of page names       -> .index
#   2. every page named in .index
#   3. every site image the pages reference (src="/...")    -> .images
#   4. every attachment the pages link to (href="/.../...")  -> .attachments
# Finally the absolute links inside the downloaded pages are rewritten so
# that they point at the local copies (sed rules are collected in .script).
#
# Requires: lynx, wget, sed, grep, sort, tr, xargs (with -0 support).

# Abort on the first failing step, as the original '&&' chain did.
# 'pipefail' is deliberately NOT enabled: a grep that finds nothing in the
# middle of a pipeline must not abort the run.
set -eu

TIMEFORMAT=' Took %lR.'
readonly base='http://www.mlton.org'
readonly target_dir='mlton.org'

# Print the site-relative target of every ATTR="/..." occurrence found in the
# downloaded pages, one per line, with the leading '/' and any '?query' part
# removed.
# Arguments: $1 - attribute name to look for ("src" or "href")
# Reads:     .index (page names) and the page files it lists
extract_targets() {
  local attr=$1
  # NUL-separate the page names so xargs applies no quote/blank parsing.
  tr '\n' '\0' < .index | xargs -0 cat -- \
    | grep -o -e " ${attr}=\"/[^\"]*\"" \
    | sed -e "s#^ ${attr}=\"/##g" -e 's#"$##g' -e 's#\?.*##g'
}

mkdir -p -- "$target_dir"
cd -- "$target_dir"

echo "Getting index:"
# Drop editor-backup pages, blank lines and the Preferences page.
time lynx -dump "$base/Index?action=titleindex" \
  | grep -v -e 'MoinEditorBackup' -e '^$' -e '^Preferences$' > .index
# The Index page itself is appended explicitly.
echo "Index" >> .index

echo "Getting pages:"
# -i: read names from file, -B: resolve them against $base, -c: resume.
time wget -c -B "$base" -nv -i .index

echo "Getting site images:"
extract_targets src | sort -u > .images
# -x -nH: recreate the remote directory layout, without a host directory.
time wget -c -B "$base" -x -nH -nv -i .images

echo "Getting attachments:"
# Only targets that contain a '/' are kept; editor backups are skipped.
extract_targets href \
  | grep -e '/' \
  | grep -v -e 'MoinEditorBackup' \
  | sort -u > .attachments
time wget -c -B "$base" -x -nH -nv -i .attachments

echo "Fixing pages:"
# Generate one sed substitution per downloaded item. Rule order matters:
# items that exist locally are rewritten first, the catch-all comes last.
{
  # ="/Page"            -> ="Page"
  sed -e 's#^\(.*\)$#s@= *"/\1"@="\1"@g#g' .index
  # ="/path?query"      -> ="path"   (images and attachments)
  sed -e 's#^\(.*\)$#s@= *"/\1\\(\\?[^"]*\\)*"@="\1"@g#g' .images .attachments
  # href="Preferences"  -> the live site's Preferences page
  printf 's# href *= *"Preferences"# href="%s/Preferences"#g\n' "$base"
  # href="/"            -> the local Home page
  printf '%s\n' 's# href *= *"/"# href="Home"#g'
  # any remaining href="/... -> the same path on the live site
  printf 's# href *= *"/# href="%s/#g\n' "$base"
} > .script

status=0
time while IFS= read -r f; do
  printf ' %s\n' "$f"
  # Capture sed's output first so that a failing sed never truncates the page.
  if result=$(sed -f .script -- "$f"); then
    printf '%s\n' "$result" > "$f"
  else
    printf 'warning: could not fix page: %s\n' "$f" >&2
    status=1
  fi
done < .index

# Non-zero when at least one page could not be rewritten.
exit "$status"
<--- end script --->