#!/usr/local/bin/perl

# Program name: make-dbm-of-urls
# Installed in: /inet/programs/

# Version: 1.0 1996/Sep/22

# Author: Rajiv Pant (Betul)   [email protected]   http://rajiv.org
#       & Ranjit Bhatnagar     [email protected]   http://moonmilk.volcano.org

# Note: You will need to adjust all folder locations in this program
# to suit your system.

# Purpose/Description:
# --------------------
#
# Makes a list (actually a dbm hash) of all the files under the web
# document root.
# The keys to the hash are the file paths in all lower case letters,
# the corresponding values are the actual pathnames which may be in
# mixed case.
#
# This list is used by:
# * A server API/CGI program to make the Unix based web server ignore
#   upper/lower case when someone requests a url like NT does. When a page
#   is not found, the server runs an api module or cgi program that converts
#   the url to all lowercase and checks this hash table, if the page is
#   found, it forwards the browser to it. If not, it gives the usual
#   not found message.
# * The indexing program to make the site searchable.
#
#
# Q. Why would I want to make my Unix web server ignore case in URLs ?
# A. Several reasons. Many sites use a mixture of naming conventions
#    especially when many people work on the site. Also, when people
#    upload files to the unix servers from PCs or MACs, the case may
#    vary depending on program used to transfer, its configuration,
#    and the file name itself.
#    Also, if your unix server shares a disk with an NT server using
#    Samba or NFS, and you want to make it searchable using MS Index
#    Server or NT based search program, this ensures that URLs will
#    always work.
#    It makes it easier for you to give out your urls without saying
#    "with an uppercase F and a lowercase o".
#    If NT web servers do not care about case in URLs, why should unix ?
#
# Author: Rajiv Pant (Betul)   [email protected]   http://rajiv.org


# ---- Libraries used ----

require 5.003 ;

use strict ;      # Part of standard perl distribution.
use File::Find ;  # Part of standard perl distribution.
use Fcntl ;       # Part of standard perl distribution.

# Note: If you do not have Berkeley DB installed, any of the
# other Perl DBMish modules (GDBM_File, NDBM_File, ODBM_File, SDBM_File)
# will also suffice.

use DB_File ;     # Part of standard perl distribution.

# ---- /Libraries used ----


# --- Directories and files ---

# This is the web server's document root. If you would like this
# program to handle some other virtual roots too, you should list
# them here.

my $document_root = '/disk2/web' ;

# $indices_dir is where the search indexes and some related files
# are stored.

my $indices_dir = '/datafiles/indices' ;

# $exclude_list is a list of folders under document root which
# should not be included in this list. Any folders inside these
# folders are also skipped. This plain text file follows a simple
# format which is explained below.

my $exclude_list = '/pin/pub/exclude-from-search.txt' ;

# $dbm_of_urls is the name of the dbm that will contain this hash
# table (associative array) of all lowercase urls to their real
# path names.

my $dbm_of_urls = $indices_dir . '/dbm_of_urls' ;

# --- /Directories and files ---


# ---- Reading the exclude list ----

# A short, sample exclude list file follows.
# The file can contain comments. Any line containing a # is considered
# a comment. To use the sample file below, you will have to remove the
# comment sign and space "# " that prefixes each entry.
#
# -- Sample begins in next line --
# ads
# clients/mohan
# clients/vic/adultpages
# messages/error
# test
# -- Sample ends in previous line --

my @not_to_be_indexed ;

# The explicit "< " mode prefix keeps this a read-only open while staying
# compatible with the perl 5.003 this program declares as its minimum.
# A missing or unreadable exclude list is reported but is not fatal:
# the program then simply excludes nothing, as it always has.

if (open (EL, "< $exclude_list"))
	{
	while (<EL>)
		{
		s/\s//g ;            # Removing spaces, tabs and newlines.
		next if /#/ ;        # Skipping comments.
		next unless /\w/ ;   # Skipping blank lines.

		# Entries are relative to the document root, so a leading
		# or trailing slash ("/ads", "ads/") means the same folder.
		s/^\/+// ;
		s/\/+$// ;

		push @not_to_be_indexed, $_ ;
		}
	close (EL) ;
	}
else
	{
	warn "$0: cannot read exclude list $exclude_list: $! "
	   . "(no folders will be excluded)\n" ;
	}

# Hash of the excluded folders, so that each file visited by find()
# costs one lookup instead of a scan of the whole exclude list.

my %is_excluded ;
@is_excluded{@not_to_be_indexed} = () ;

#print join "\n", @not_to_be_indexed ; exit ; # debug

# ---- /Reading the exclude list ----


# ---- main ----

# Note: Depending on how you set up your system, you may want to
# first remove the existing dbm file before adding urls to it here.

# If the tie fails, %dbm_of_urls would be an ordinary in-memory hash and
# the whole traversal would be thrown away at exit, so stop right here.

my %dbm_of_urls ;

tie (%dbm_of_urls, 'DB_File', $dbm_of_urls, O_RDWR|O_CREAT, 0644)
	or die "$0: cannot open dbm file $dbm_of_urls: $!\n" ;

find (\&add_url_to_dbm, $document_root) ;

untie %dbm_of_urls ;

# ---- /main ----


# The add_url_to_dbm subroutine is called by the find subroutine as
# it recurses the directory tree, once for every file and folder, with
# $_ set to the entry's name and $File::Find::name set to its full path.
#
# When add_url_to_dbm sees a directory that is in the not to be indexed
# list, it tells find() not to recurse into that folder. find skips to
# the next folder and the list gets built saving system resources that
# would have been wasted in a complete traversal. The excluded folder
# itself is not added to the dbm either.
#
# Every other entry is stored in %dbm_of_urls: the key is its path
# relative to the document root in lower case, the value is that same
# relative path in its real (possibly mixed) case.
#
# Takes no arguments and returns nothing.

sub add_url_to_dbm
{
my $root_prefix = $document_root . '/' ;
my $full_path   = $File::Find::name ;

# find() also visits the document root itself, which has no path
# relative to the root. Plain string comparison is used instead of a
# pattern match so that characters such as "." or "+" in the document
# root are never treated as regular expression metacharacters.

return unless substr ($full_path, 0, length $root_prefix) eq $root_prefix ;

my $URL = substr ($full_path, length $root_prefix) ;

if (-d $_ and exists $is_excluded{$URL})
	{
	$File::Find::prune = 1 ;
	return ;
	}

(my $in_lower_case = $URL) =~ tr/A-Z/a-z/ ;

$dbm_of_urls{$in_lower_case} = $URL ;

return ;
}

# ---- end of sub add_url_to_dbm ----

# Author: Rajiv Pant (Betul)   [email protected]   http://rajiv.org