#!/usr/local/bin/perl

# Program name: make-dbm-of-urls
# Installed in: /inet/programs/

# Version: 1.0 1996/Sep/22

# Author: Rajiv Pant (Betul)  [email protected]  http://rajiv.org
# & Ranjit Bhatnagar  [email protected]  http://moonmilk.volcano.org

# Note: You will need to adjust all folder locations in this program
# to suit your system.

# Purpose/Description:
# --------------------
#
# Makes a list (actually a dbm hash) of all the files under the web
# document root.
# The keys to the hash are the file paths in all lower case letters,
# the corresponding values are the actual pathnames which may be in
# mixed case.
#
# This list is used by:
# * A server API/CGI program to make the Unix based web server ignore
# upper/lower case when someone requests a url like NT does. When a page
# is not found, the server runs an api module or cgi program that converts
# the url to all lowercase and checks this hash table, if the page is
# found, it forwards the browser to it. If not, it gives the usual
# not found message.
# * The indexing program to make the site searchable.
#
#
# Q. Why would I want to make my Unix web server ignore case in URLs ?
# A. Several reasons. Many sites use a mixture of naming conventions
#    especially when many people work on the site. Also, when people
#    upload files to the unix servers from PCs or MACs, the case may
#    vary depending on the program used to transfer, its configuration,
#    and the file name itself.
#    Also, if your unix server shares a disk with an NT server using
#    Samba or NFS, and you want to make it searchable using MS Index
#    Server or NT based search program, this ensures that URLs will
#    always work.
#    It makes it easier for you to give out your urls without saying
#    "with an uppercase F and a lowercase o".
#    If NT web servers do not care about case in URLs, why should unix ?
#
# Author: Rajiv Pant (Betul)  [email protected]  http://rajiv.org



# ---- Libraries used ----

require 5.003 ;

use File::Find ;	# Part of standard perl distribution.

use Fcntl ;		# Part of standard perl distribution.

# Note: If you do not have Berkeley DB installed, any of the
# other Perl DBMish modules (GDBM_File, NDBM_File, ODBM_File, SDBM_File)
# will also suffice.

use DB_File ; 		# Part of standard perl distribution.


# ---- /Libraries used ----



# --- Directories and files --- 

# Web server's document root: the directory tree that gets walked.
# If you would like this program to handle some other virtual roots
# too, you should list them here.

$document_root = '/disk2/web' ;


# Folder holding the search indexes and some related files.

$indices_dir = '/datafiles/indices' ;


# Plain text file listing the folders under the document root which
# should not be included in the list. Anything inside those folders
# is skipped as well. The file format is explained further below.

$exclude_list = '/pin/pub/exclude-from-search.txt' ;


# Name of the dbm file, kept inside $indices_dir, that holds the hash
# table (associative array) mapping all-lowercase urls to their real
# path names.

$dbm_of_urls = "$indices_dir/dbm_of_urls" ;


# --- /Directories and files --- 




# ---- Reading the exclude list ----

# A short, sample exclude list file follows.
# The file can contain comments. Any line containing a # is considered
# a comment. To use the sample file below, you will have to remove the
# comment sign and space "# " that prefixes each entry.
#
# -- Sample begins in next line --
# ads
# clients/mohan
# clients/vic/adultpages
# messages/error
# test
# -- Sample ends in previous line --

# read_exclude_list (FILE)
#
# Reads the exclude list FILE and returns its entries as a list, one
# entry per usable line. All whitespace is removed from each line; a
# line that contains a # anywhere, or that has no word character left,
# is skipped.
#
# If FILE cannot be opened, a warning is printed and an empty list is
# returned, so the program still runs but excludes nothing.

sub read_exclude_list
{
my ($file) = @_ ;
my @entries ;

local $_ ;		# The read loop below must not clobber the caller's $_.

# Explicit "<" so that the file can only ever be opened for reading.
unless (open (EL, "< $file"))
  {
  warn "Cannot read exclude list $file: $!\n" ;
  return ;
  }

while (<EL>)
  {
  s/\s//g ;		# Removing spaces, tabs and newlines.
  next if /#/ ;		# Skipping comments.
  next unless /\w/ ;	# Skipping blank lines.

  push @entries, $_ ;
  }
close (EL) ;

return @entries ;
}

push @not_to_be_indexed, read_exclude_list ($exclude_list) ;

#print join "\n", @not_to_be_indexed ; exit ; # debug

# ---- /Reading the exclude list ----




# ---- main ----

# Note: Depending on how you set up your system, you may want to
# first remove the existing dbm file before adding urls to it here.
# The dbm is opened without truncating it and this run only adds or
# overwrites keys, so entries for files deleted since the last run
# stay in it.

# Stop right away if the dbm cannot be opened. Without the tie,
# %dbm_of_urls would be an ordinary in-memory hash and the result of
# the whole traversal would be lost when the program exits.

tie (%dbm_of_urls, 'DB_File', $dbm_of_urls, O_RDWR|O_CREAT, 0644)
  or die "Cannot open dbm file $dbm_of_urls: $!\n" ;

# find() calls add_url_to_dbm once for every entry under the root.

find (\&add_url_to_dbm, $document_root) ;

untie %dbm_of_urls ;

# ---- /main ----




# The add_url_to_dbm subroutine is called by find() for every entry it
# meets while walking the directory tree. It takes no arguments; it
# reads $File::Find::name (full path of the entry), $_ (the entry as
# set by find), $document_root and @not_to_be_indexed, and it writes
# to %dbm_of_urls.
#
# For each entry below the document root, the path relative to the
# root is stored under its all-lowercase form:
#
#   $dbm_of_urls{'dir/index.html'} = 'Dir/Index.HTML'
#
# When the entry is a folder named in the not to be indexed list, it
# is not stored and find() is told not to recurse into it. find then
# skips to the next folder, which saves the system resources that a
# complete traversal would have wasted.

sub add_url_to_dbm
{
my $path   = $File::Find::name ;
my $prefix = $document_root . '/' ;

# The document root itself, or anything outside it, has no url.
# Plain string comparison is used throughout instead of a pattern
# match, so that characters like . or + in the folder names are
# never treated as regular expression operators.
return unless index ($path, $prefix) == 0 ;

my $URL = substr ($path, length ($prefix)) ;
return if $URL eq '' ;

# An entry is excluded when it is a listed folder itself or when it
# lies somewhere below one. The "$_/" test keeps 'ads' from also
# excluding a sibling such as 'adsense'.
if (grep { $URL eq $_ || index ($URL, "$_/") == 0 } @not_to_be_indexed)
  {
  $File::Find::prune = 1 if -d $_ ;
  return ;
  }

# tr is used (rather than lc) so that only A-Z are folded.
my $in_lower_case = $URL ;
$in_lower_case =~ tr/A-Z/a-z/ ;

$dbm_of_urls{$in_lower_case} = $URL ;

} # ---- end of sub add_url_to_dbm ----

# Author: Rajiv Pant (Betul)   [email protected]   http://rajiv.org