#!/bin/bash
# Script to generate HTML Sitemap based on urllist.txt
# Copyright Ben Tasker 2009
# Released under the GNU GPL
# ---------------------------------------------------------------------------
# Configuration. Use absolute paths: the script changes directory below.
# ---------------------------------------------------------------------------
# Path to Url listing
URLLIST="/path/to/urllist.txt"
# Path to Sitemap Template
TEMPLATE="/path/to/sitemap_template.html"
# Where should the new sitemap be placed
SITEMAPLOCAT="/path/to/sitemap.html"

# Refuse to start without the inputs; otherwise an empty or broken page would
# be built and moved over the published sitemap at the end of the run.
if [[ ! -r "$URLLIST" ]]; then
  echo "Cannot read URL list: $URLLIST" >&2
  exit 1
fi
if [[ ! -r "$TEMPLATE" ]]; then
  echo "Cannot read sitemap template: $TEMPLATE" >&2
  exit 1
fi

# Work inside a private scratch directory instead of directly in /tmp: the
# script uses fixed file names (sitemap.html, sitemaptmp, tmpfile, ...) which
# are unsafe and collision-prone in a shared, world-writable directory.
WORKDIR=$(mktemp -d "${TMPDIR:-/tmp}/sitemap.XXXXXX") || {
  echo "Unable to create a scratch directory" >&2
  exit 1
}
# Remove the scratch directory however the script ends.
trap 'rm -rf -- "$WORKDIR"' EXIT
cd "$WORKDIR" || {
  echo "Unable to enter $WORKDIR" >&2
  exit 1
}

# Prepare the Template Header
# Strip newlines so the whole template is a single awk record.
tr -d '\n' < "$TEMPLATE" > sitemaptmp || {
  echo "Unable to read sitemap template: $TEMPLATE" >&2
  exit 1
}

# Now lets just take the header
# NOTE(review): the awk field separator is empty here. It looks like the marker
# string that splits the template into header and footer has been lost from
# this copy of the script - restore it (and confirm the field number) to match
# the template in use.
PAGEHEAD=$(awk -F "" '{print $2;}' sitemaptmp)

# Start the sitemap: header followed by the "Content Types" index.
# NOTE(review): the labels below are emitted as bare text; any markup that
# originally wrapped them (headings, list items, anchors) is not present in
# this copy - confirm against the intended page layout.
{
  # Quoted so the header is written verbatim (no word splitting or globbing).
  printf '%s\n' "$PAGEHEAD"
  printf '\n%s\n\n' 'Content Types'
  printf '\n%s\n' 'HTML'
  printf '%s\n' 'PDF' 'Plain Text' 'Images' 'Other Document Types'
  printf '\n\n'
  # Six empty lines separate the index from the listings.
  printf '\n\n\n\n\n\n'
} > sitemap.html
# URL Processing
# Lets put the HTML pages first

# html_pass_wanted URL
# Succeeds when URL contains one of the web page extensions handled by this
# pass (.htm/.html, .shtml, .xhtml, .php, .asp). The match is unanchored so
# URLs with a query string still qualify, and case-insensitive so that e.g.
# ".HTML" is listed here rather than being dropped from every section.
html_pass_wanted() {
  printf '%s\n' "$1" | grep -Eiq '[.](html?|shtml|xhtml|php|asp)'
}

# html_pass_title FILE
# Prints the text between the first <title> and </title> in FILE with
# surrounding whitespace removed. Prints nothing when there is no title.
html_pass_title() {
  # Fold line breaks/tabs into single spaces so a title spanning several
  # lines is still found, then keep only the first title element.
  tr -s '\r\n\t ' '    ' < "$1" \
    | grep -oi '<title[^>]*>[^<]*</title' \
    | head -n 1 \
    | sed -e 's/^[^>]*>//' -e 's/<.*$//' -e 's/^ *//' -e 's/ *$//'
}

# html_pass_link URL TEXT
# Prints one sitemap entry: a link to URL, opening in a new window, labelled
# TEXT.
# NOTE(review): the markup of this line was damaged in this copy of the script
# (only '" target=_blank>' survived); the anchor below is a reconstruction.
# Any list-item or line-break markup that wrapped it originally is unknown.
html_pass_link() {
  printf '<a href="%s" target=_blank>%s </a>\n' "$1" "$2"
}

echo "
HTML
" >> sitemap.html

# Unpredictable scratch file for the downloaded page.
PAGEFILE=$(mktemp "${TMPDIR:-/tmp}/sitemap-page.XXXXXX") || {
  echo "Unable to create a temporary file" >&2
  exit 1
}

if [[ -r "$URLLIST" ]]; then
  # Read whitespace-separated URLs one at a time. Unlike `for i in $(cat ...)`
  # the entries are never subject to filename expansion. The list is read on
  # file descriptor 3 so nothing inside the loop can consume it via stdin.
  while IFS= read -r -u 3 url || [[ -n "$url" ]]; do
    [[ -n "$url" ]] || continue

    if ! html_pass_wanted "$url"; then
      # File does not have a recognised extension
      echo "Non HTML/SHTML/XHTML/ASP/PHP file found at $url"
      echo "Skipping"
      echo "File $url has not been inserted into sitemap"
      continue
    fi

    # Get the page. Any non-zero status is a failure: wget uses several
    # distinct codes (e.g. 4 for network errors, 8 for a server error
    # response such as 404), not just 1.
    if ! wget -q -O "$PAGEFILE" -- "$url"; then
      echo "Retrieval of $url failed"
      echo "$url has not been included in the sitemap"
      continue
    fi

    PAGETITLE=$(html_pass_title "$PAGEFILE")
    # Fall back to the address so the link is never left without a label.
    [[ -n "$PAGETITLE" ]] || PAGETITLE=$url

    html_pass_link "$url" "$PAGETITLE" >> sitemap.html
  done 3< <(tr -s '[:space:]' '\n' < "$URLLIST")
else
  echo "Cannot read URL list: $URLLIST" >&2
fi

# Tidy up before generating the next section
rm -f -- "$PAGEFILE"
# URL Processing
# Lets put the PDF's Next

# pdf_pass_wanted URL
# Succeeds when URL contains ".pdf" in any letter case. The match is
# unanchored, so a trailing query string does not prevent it.
pdf_pass_wanted() {
  printf '%s\n' "$1" | grep -Eiq '[.]pdf'
}

# pdf_pass_link URL
# Prints one sitemap entry linking to URL. A PDF has no easily extracted
# title, so the address itself is used as the link text.
# NOTE(review): the markup of this line was damaged in this copy of the script
# (only '" target=_blank>' survived); the anchor below is a reconstruction.
# Any list-item or line-break markup that wrapped it originally is unknown.
pdf_pass_link() {
  printf '<a href="%s" target=_blank>%s </a>\n' "$1" "$1"
}

echo "
PDF
" >> sitemap.html

if [[ -r "$URLLIST" ]]; then
  # Read whitespace-separated URLs one at a time. Unlike `for i in $(cat ...)`
  # the entries are never subject to filename expansion. The list is read on
  # file descriptor 3 so nothing inside the loop can consume it via stdin.
  while IFS= read -r -u 3 url || [[ -n "$url" ]]; do
    [[ -n "$url" ]] || continue

    if ! pdf_pass_wanted "$url"; then
      # File does not have a recognised extension
      echo "Non PDF file found at $url"
      echo "Skipping"
      echo "File $url has not been inserted into sitemap on this pass"
      continue
    fi

    # Only the success of the download matters, so the body is discarded.
    # Any non-zero status is a failure: wget uses several distinct codes
    # (e.g. 4 for network errors, 8 for a server error response such as 404).
    if ! wget -q -O /dev/null -- "$url"; then
      echo "Retrieval of $url failed"
      echo "$url has not been included in the sitemap"
      continue
    fi

    pdf_pass_link "$url" >> sitemap.html
  done 3< <(tr -s '[:space:]' '\n' < "$URLLIST")
else
  echo "Cannot read URL list: $URLLIST" >&2
fi
# URL Processing
# Lets put the Text Files Next

# text_pass_wanted URL
# Succeeds when URL contains ".txt" in any letter case. The match is
# unanchored, so a trailing query string does not prevent it.
text_pass_wanted() {
  printf '%s\n' "$1" | grep -Eiq '[.]txt'
}

# text_pass_link URL
# Prints one sitemap entry linking to URL. A text file has no title, so the
# address itself is used as the link text.
# NOTE(review): the markup of this line was damaged in this copy of the script
# (only '" target=_blank>' survived); the anchor below is a reconstruction.
# Any list-item or line-break markup that wrapped it originally is unknown.
text_pass_link() {
  printf '<a href="%s" target=_blank>%s </a>\n' "$1" "$1"
}

echo "
Plain Text
" >> sitemap.html

if [[ -r "$URLLIST" ]]; then
  # Read whitespace-separated URLs one at a time. Unlike `for i in $(cat ...)`
  # the entries are never subject to filename expansion. The list is read on
  # file descriptor 3 so nothing inside the loop can consume it via stdin.
  while IFS= read -r -u 3 url || [[ -n "$url" ]]; do
    [[ -n "$url" ]] || continue

    if ! text_pass_wanted "$url"; then
      # File does not have a recognised extension
      echo "Non Text file found at $url"
      echo "Skipping"
      echo "File $url has not been inserted into sitemap on this pass"
      continue
    fi

    # Only the success of the download matters, so the body is discarded.
    # Any non-zero status is a failure: wget uses several distinct codes
    # (e.g. 4 for network errors, 8 for a server error response such as 404).
    if ! wget -q -O /dev/null -- "$url"; then
      echo "Retrieval of $url failed"
      echo "$url has not been included in the sitemap"
      continue
    fi

    text_pass_link "$url" >> sitemap.html
  done 3< <(tr -s '[:space:]' '\n' < "$URLLIST")
else
  echo "Cannot read URL list: $URLLIST" >&2
fi
# URL Processing
# Lets put the Image Files Next

# image_pass_wanted URL
# Succeeds when URL contains one of the image extensions handled by this pass
# (.gif, .jpg, .jpeg, .png, .bmp, .svc, .psd) in any letter case. The match is
# unanchored, so a trailing query string does not prevent it.
# NOTE(review): ".svc" is probably a typo for ".svg". It is left unchanged
# because the "other documents" pass excludes exactly the same list; change
# both together or .svg files would be listed twice / not at all.
image_pass_wanted() {
  printf '%s\n' "$1" | grep -Eiq '[.](gif|jpe?g|png|bmp|svc|psd)'
}

# image_pass_link URL
# Prints one sitemap entry linking to URL. An image has no title, so the
# address itself is used as the link text.
# NOTE(review): the markup of this line was damaged in this copy of the script
# (only '" target=_blank>' survived); the anchor below is a reconstruction.
# Any list-item or line-break markup that wrapped it originally is unknown.
image_pass_link() {
  printf '<a href="%s" target=_blank>%s </a>\n' "$1" "$1"
}

echo "
Images
" >> sitemap.html

if [[ -r "$URLLIST" ]]; then
  # Read whitespace-separated URLs one at a time. Unlike `for i in $(cat ...)`
  # the entries are never subject to filename expansion. The list is read on
  # file descriptor 3 so nothing inside the loop can consume it via stdin.
  while IFS= read -r -u 3 url || [[ -n "$url" ]]; do
    [[ -n "$url" ]] || continue

    if ! image_pass_wanted "$url"; then
      # File does not have a recognised extension
      echo "Non Image file found at $url"
      echo "Skipping"
      echo "File $url has not been inserted into sitemap on this pass"
      continue
    fi

    # Only the success of the download matters, so the body is discarded.
    # Any non-zero status is a failure: wget uses several distinct codes
    # (e.g. 4 for network errors, 8 for a server error response such as 404).
    if ! wget -q -O /dev/null -- "$url"; then
      echo "Retrieval of $url failed"
      echo "$url has not been included in the sitemap"
      continue
    fi

    image_pass_link "$url" >> sitemap.html
  done 3< <(tr -s '[:space:]' '\n' < "$URLLIST")
else
  echo "Cannot read URL list: $URLLIST" >&2
fi
# URL Processing
# Rarely post Documents from OpenOffice/Office, but just in case

# other_pass_indexed URL
# Succeeds when URL contains an extension that an earlier section already
# covers (web pages, PDF, plain text, images), in any letter case. Everything
# for which this fails belongs in the "Other Documents" section.
# NOTE(review): ".svc" is probably a typo for ".svg"; it mirrors the list used
# by the image pass, so change both together.
other_pass_indexed() {
  printf '%s\n' "$1" \
    | grep -Eiq '[.](html?|shtml|xhtml|asp|php|pdf|txt|gif|jpe?g|png|bmp|svc|psd)'
}

# other_pass_link URL
# Prints one sitemap entry linking to URL, using the address itself as the
# link text.
# NOTE(review): the markup of this line was damaged in this copy of the script
# (only '" target=_blank>' survived); the anchor below is a reconstruction.
# Any list-item or line-break markup that wrapped it originally is unknown.
other_pass_link() {
  printf '<a href="%s" target=_blank>%s </a>\n' "$1" "$1"
}

echo "
Other Documents
" >> sitemap.html

if [[ -r "$URLLIST" ]]; then
  # Read whitespace-separated URLs one at a time. Unlike `for i in $(cat ...)`
  # the entries are never subject to filename expansion. The list is read on
  # file descriptor 3 so nothing inside the loop can consume it via stdin.
  while IFS= read -r -u 3 url || [[ -n "$url" ]]; do
    [[ -n "$url" ]] || continue

    # We want all other file types to be listed here, so exclude the ones
    # already indexed by the previous sections.
    if other_pass_indexed "$url"; then
      echo "Previously indexed Filetype found at $url"
      echo "Skipping"
      continue
    fi

    # Only the success of the download matters, so the body is discarded.
    # Any non-zero status is a failure: wget uses several distinct codes
    # (e.g. 4 for network errors, 8 for a server error response such as 404).
    if ! wget -q -O /dev/null -- "$url"; then
      echo "Retrieval of $url failed"
      echo "$url has not been included in the sitemap"
      continue
    fi

    other_pass_link "$url" >> sitemap.html
  done 3< <(tr -s '[:space:]' '\n' < "$URLLIST")
else
  echo "Cannot read URL list: $URLLIST" >&2
fi
# Six empty lines separate the listings from the footer.
printf '\n\n\n\n\n\n' >> sitemap.html

# Bung the footer on the end
# sitemaptmp is the newline-stripped copy of the template written at the start
# of the script.
# NOTE(review): the awk field separator is empty here. It looks like the marker
# string that splits the template into header and footer has been lost from
# this copy of the script - restore it (and confirm the field number) to match
# the template in use.
PAGEFOOT=$(awk -F "" '{print $2;}' sitemaptmp)
# Quoted so the footer is written verbatim (no word splitting or globbing).
printf '%s\n' "$PAGEFOOT" >> sitemap.html
rm -f -- sitemaptmp

# Move the sitemap to the desired place. Report a failed move instead of
# announcing success with the old sitemap still in place.
if ! mv -- sitemap.html "$SITEMAPLOCAT"; then
  echo "Unable to move the sitemap to $SITEMAPLOCAT" >&2
  exit 1
fi
echo "Finished!"