#!/util/bin/perl
# KSearch v1.4
# Copyright (C) 2000 David Kim (kscripts.com)
# Parts of this script are Copyright
# www.perlfect.com (C)2000 G.Zervas. All rights reserved
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
# USA
use Benchmark; # time search
my $t0 = new Benchmark;
use locale;
use CGI;
use CGI::Carp qw(fatalsToBrowser);
use Fcntl;
{
    # Append the directory part of the script path to @INC (presumably so the
    # relative require of the configuration file resolves no matter what the
    # current working directory is). Nothing is added when $0 has no
    # directory component or that component is empty.
    my ($script_dir) = $0 =~ m{(.*)/};
    push @INC, $script_dir if $script_dir;
}
###### You may have to add the full path to your configuration file below######
###############################################################################
require 'configuration/configuration.pl'; #CONFIGURATION PATH#
my $usehash = 1;    # stays true unless the selected DBM module is SDBM/ODBM/NDBM
my $dbm_package;    # name of the first DBM module that could be loaded
# To use the -T switch, uncomment the next 2 lines and comment out the whole
# "if ($USE_DBM) { ... }" block that follows them.
# Note: You must have the DB_File perl module to run taint mode
# and add ./ in front of the CONFIGURATION PATH above.
#use DB_File;
#$dbm_package = 'DB_File';
if ($USE_DBM) {
    # Probe the candidate DBM modules in preference order and keep the first
    # one that loads. The list lives in @AnyDBM_File::ISA, hence the
    # temporary package switch; $usehash and $dbm_package are file-scoped
    # lexicals and stay visible across it.
    package AnyDBM_File;
    @ISA = qw(DB_File GDBM_File SDBM_File ODBM_File NDBM_File) unless @ISA;
    foreach my $candidate (@ISA) {
        next unless eval "require $candidate";
        # For these three flavours page contents are not read from the DBM
        # hash (see the $usehash checks where contents are tied and fetched).
        $usehash = 0 if $candidate =~ /[SON]DBM_File/;
        $dbm_package = $candidate;
        last;
    }
    package main;
}
# Per-file index tables, all keyed by the same file key.
my %f_file_db;            # file path
my %f_date_db;            # file modification date
my %f_size_db;            # file size
my %f_termcount_db;       # number of non-space characters for score
my %descriptions_db;      # file description
my %filenames_db;         # file names
my %titles_db;            # file title
my %contents_db;          # file contents
my %alt_text_db;          # alt text
my %meta_description_db;  # meta descriptions
my %meta_keyword_db;      # meta keywords
my %meta_author_db;       # meta authors
my %links_db;             # links
if ($USE_DBM) {
    # One row per table: [ hash to tie, DBM file, tie it? ].
    # Rows are processed top to bottom; the first seven are always tied, the
    # contents table only when $usehash is set, and the remaining ones only
    # when the matching configuration switch is on.
    my @dbm_tables = (
        [ \%f_file_db,           $F_FILE_DB_FILE,           1 ],
        [ \%f_date_db,           $F_DATE_DB_FILE,           1 ],
        [ \%f_size_db,           $F_SIZE_DB_FILE,           1 ],
        [ \%f_termcount_db,      $F_TERMCOUNT_DB_FILE,      1 ],
        [ \%descriptions_db,     $DESCRIPTIONS_DB_FILE,     1 ],
        [ \%titles_db,           $TITLES_DB_FILE,           1 ],
        [ \%filenames_db,        $FILENAMES_DB_FILE,        1 ],
        [ \%contents_db,         $CONTENTS_DB_FILE,         $usehash ],
        [ \%alt_text_db,         $ALT_TEXT_DB_FILE,         $ALT_TEXT ],
        [ \%meta_description_db, $META_DESCRIPTION_DB_FILE, $META_DESCRIPTION ],
        [ \%meta_keyword_db,     $META_KEYWORD_DB_FILE,     $META_KEYWORD ],
        [ \%meta_author_db,      $META_AUTHOR_DB_FILE,      $META_AUTHOR ],
        [ \%links_db,            $LINKS_DB_FILE,            $LINKS ],
    );
    foreach my $table (@dbm_tables) {
        my ($hash_ref, $db_file, $enabled) = @$table;
        next unless $enabled;
        # Read-only tie; the script dies on the first table that cannot be opened.
        tie %$hash_ref, $dbm_package, $db_file, O_RDONLY, 0755 or die "Cannot open $db_file: $!";
    }
}
# Request object and script-wide search state.
# These are file-scoped lexicals shared by the subroutines below;
# start_search() resets most of them at the start of every pass.
my $query = new CGI;
my $html; # returned HTML page
my $query_terms_copy; # query as rebuilt by process_terms() from the accepted/ignored terms
my $bare_query_terms; # original query
my @terms; # terms/phrases
my @checked_terms; # processed terms/phrases
my %stopwords; # keys are stopterms in query
# ignore_terms() is defined outside this section of the file.
my $stopwords_regex = ignore_terms(); # stopwords regular expression
my $subsearch; # true if search within results
my $search_within_results; # for subsearch loop
my $previous_query; # previous queries
my $previous_queries; # previous queries for subsearch to add to links in results page
my @previous_queries; # previous queries for subsearch for loop
my %previous_results; # previous results for subsearch loop
# Search options (set from the option prefixes parsed in start_search()).
my $whole_word; # true if search for whole words
my $all; # true if search includes stop terms
my $case_sensitive; # true if case sensitive
my $search_body; # true if search body
my $search_title; # true if search titles
my $search_meta_description; # true if search meta descriptions
my $search_meta_keyword; # true if search meta keywords
my $search_meta_author; # true if search meta authors
my $search_alt_text; # true if search alt text
my $search_links; # true if search links
my $search_url; # true if search url
my $add_plus; # if true, add + to all non +/- terms/phrases
# Term buckets filled by process_terms().
my @plusf; # +boolean terms/phrases for search
my @minusf; # -boolean terms/phrases for search
my @otherf; # other terms/phrases for search
my @none; # +boolean terms/phrases without results
my @final_files; # final files
my %minus; # keys are files with -boolean term/phrase
# NOTE(review): the next two are not referenced in the part of the file
# reviewed here; their purpose should be confirmed against the code that uses them.
my %clean_body;
my $delimitererror;
# Result presentation and scoring.
my $score; # Score header
my $weight_tip; # note to user about weights
my $totalmatches; # total match count
my $totalsize; # total size of all files with matches
my @sortedanswers; # final list of sorted answers
my %matches; # total matches for each file
my %score_numerator; # characters that match x weights applied
my %score_denominator; # total characters
my %finalscores; # final score for each file
# Apply per-request overrides of the configured defaults, emit the HTTP
# header, and run the search.
#
# Every CGI parameter is read exactly once, in scalar context, into a
# temporary. The temporaries live in a bare block so they cannot be picked up
# by subroutines defined later in the file.
{
    # set sorting choice: only the known sort keys are accepted
    my %valid_sort = map { $_ => 1 }
        ('Scores', 'Dates', 'Matches', 'Sizes', 'Titles', 'File Names');
    my $sort_choice = $query->param('sort');
    $SORT_BY = $sort_choice if defined $sort_choice && $valid_sort{$sort_choice};

    # results per page: must be a plain integer in 5..100. A numeric range
    # test alone would also accept strings such as "10abc", because Perl
    # compares only the numeric prefix, and would store them verbatim.
    my $per_page = $query->param('display');
    if (defined $per_page && $per_page =~ /\A[0-9]+\z/ && $per_page >= 5 && $per_page <= 100) {
        $RESULTS_PER_PAGE = $per_page;
    }

    $show_matches = $query->param('showm');

    # to search within previous results: needs the feature enabled, a
    # non-blank previous-query string, a non-blank current query, and no
    # help request
    my $prior     = $query->param('pq');
    my $current   = $query->param($FORM_INPUT_NAME);
    my $help_flag = $query->param('help');
    if ($SEARCH_RESULTS && $prior !~ /^\s*$/ && $current !~ /^\s*$/ && $help_flag != 1) {
        $previous_queries = $prior . ' ';
        # Pass a single scalar: param() in list context would expand a
        # repeated "pq" parameter into several arguments.
        @previous_queries = split " ", CGI::unescape($prior);
    }
}
print $query->header;
start_search();
##Subroutines############
# Run one search pass.
#
# Resets the shared per-search state, picks the query to run (the next entry
# of @previous_queries when searching within earlier results, otherwise the
# submitted form field with the selected options prepended as "name:"
# prefixes), parses those prefixes back into option flags, extracts quoted
# phrases, and then calls process_terms(), search_files(),
# process_booleans() and get_sorted_answers() in that order.
# Takes no arguments; works entirely through file-scoped and configuration
# variables.
sub start_search {
my $query_terms; # initialize variables
$score = 'Score:';
$totalmatches = ""; $totalsize = ""; $weight_tip = "";
@checked_terms = (); @plusf = (); @minusf = (); @otherf = (); @none = (); @final_files = ();
%stopwords = (); %minus = (); %matches = (); %score_numerator = (); %score_denominator = (); %finalscores = ();
$query_terms_copy = ""; $add_plus = ""; $all = ""; $whole_word = ""; $case_sensitive = ""; $search_title = "";
$search_meta_description = ""; $search_meta_keyword = ""; $search_meta_author = "";
$search_alt_text = ""; $search_body = ""; $search_links = ""; $search_url = "";
#if (@previous_queries && scalar@previous_queries < 7) { # to prevent looping too much
if (@previous_queries) { # search results of previous queries
# Consume the oldest pending previous query and flag this pass as a sub-search.
$query_terms = shift @previous_queries;
$subsearch = 1;
} else { # search current query
$query_terms = $query->param($FORM_INPUT_NAME);
# NOTE(review): both alternatives of this pattern read as a single space
# here, which would make the substitution a no-op; the pattern may
# originally have matched non-breaking-space characters or entities.
# Confirm against the upstream script.
$query_terms =~ s/( )|( )/ /gs; # remove spaces
# translate_characters() is defined outside this section of the file.
$query_terms = translate_characters($query_terms); # ISO Latin approximations
$bare_query_terms = $query_terms; # original query
# Encode the form's option fields as "name:" prefixes on the query text;
# the option loop further down decodes them again. Note the naming:
# the "all" prefix ends up setting $add_plus, while "st" sets $all.
$query_terms = 'all:'.$query_terms if $query->param('all') == 1;
$query_terms = 'c:'.$query_terms if ($query->param('c') eq "s" && $CASE_SENSITIVE);
$query_terms = 'w:'.$query_terms if $query->param('w') == 1;
$query_terms = 'st:'.$query_terms if ($query->param('st') == 1 && $ALL);
unless ($query->param('default') == 1) { # search content options
$query_terms = 'b:'.$query_terms if $query->param('b') == 1;
$query_terms = 't:'.$query_terms if ($query->param('t') == 1);
$query_terms = 'd:'.$query_terms if ($query->param('d') == 1 && $META_DESCRIPTION);
$query_terms = 'k:'.$query_terms if ($query->param('k') == 1 && $META_KEYWORD);
$query_terms = 'au:'.$query_terms if ($query->param('au') == 1 && $META_AUTHOR);
$query_terms = 'alt:'.$query_terms if ($query->param('alt') == 1 && $ALT_TEXT);
$query_terms = 'l:'.$query_terms if ($query->param('l') == 1 && $LINKS);
$query_terms = 'u:'.$query_terms if ($query->param('u') == 1 && $URL);
}
# Trim both ends and collapse runs of whitespace to a single space.
$query_terms =~ s/^\s+//;
$query_terms =~ s/\s+$//;
$query_terms =~ s/\s+/ /g;
$previous_query = $query_terms; # query with options for previous query option
$subsearch = "";
}
# Strip leading "option:" prefixes one at a time (case-insensitive) and
# translate each into a flag, a sort order, or a results-per-page value.
# Options gated by a configuration switch are silently ignored when the
# switch is off.
while ($query_terms =~ s/^(c|[0-9]+|score|date|match|size|title|name|b|t|d|k|au|alt|st|w|l|u|all)\://io) {
my $option = $1; # let user add options directly in query text field
$query_terms =~ s/^\s+//;
if ($option =~ /^c$/i && $CASE_SENSITIVE) { $case_sensitive = 1; next; }
if ($option =~ /^score$/i) { $SORT_BY = "Scores"; next; }
if ($option =~ /^date$/i) { $SORT_BY = "Dates"; next; }
if ($option =~ /^match$/i) { $SORT_BY = "Matches"; next; }
if ($option =~ /^size$/i) { $SORT_BY = "Sizes"; next; }
if ($option =~ /^title$/i) { $SORT_BY = "Titles"; next; }
if ($option =~ /^name$/i) { $SORT_BY = "File Names"; next; }
if ($option =~ /^b$/i) { $search_body = 1; next; }
if ($option =~ /^t$/i) { $search_title = 1; next; }
if ($option =~ /^d$/i && $META_DESCRIPTION) { $search_meta_description = 1; next; }
if ($option =~ /^k$/i && $META_KEYWORD) { $search_meta_keyword = 1; next; }
if ($option =~ /^au$/i && $META_AUTHOR) { $search_meta_author = 1; next; }
if ($option =~ /^alt$/i && $ALT_TEXT) { $search_alt_text = 1; next; }
if ($option =~ /^u$/i && $URL) { $search_url = 1; next; }
if ($option =~ /^l$/i && $LINKS) { $search_links = 1; next; }
if ($option =~ /^st$/i && $ALL) { $all = 1; next; }
if ($option =~ /^w$/i) { $whole_word = 1; next; }
if ($option =~ /^all$/i) { $add_plus = 1; next; }
# A purely numeric option sets the results per page, clamped to 5..100.
if ($option =~ /^([0-9]+)$/) {
if ($option < 5) { $RESULTS_PER_PAGE = 5; }
elsif ($option > 100) { $RESULTS_PER_PAGE = 100; }
else { $RESULTS_PER_PAGE = $option; }
}
}
# returnresults() is defined outside this section.
# NOTE(review): presumably it does not return to this point; otherwise the
# search below would still run for a blank query or a help request. Confirm.
returnresults() if ($query->param('help') == 1 || $query_terms =~ /^\s*$/); # return page if no query or for help
# No search field selected at all: fall back to body + title + meta
# description and take the show-matches setting from the configuration.
if (!$search_title && !$search_meta_description && !$search_meta_keyword && !$search_meta_author && !$search_alt_text && !$search_body && !$search_links && !$search_url) {
$show_matches = $SHOW_MATCHES; $search_body = 1; $search_title = 1; $search_meta_description = 1; # search body, title, and meta description as default
}
my @phrases;
if ($DO_PHRASES) { # get phrases
# Each pass cuts one kind of quoted phrase out of the query text (replacing
# it with a space) and hands prefix + contents to get_phrase():
#   1) +<N>"..."   2) <N>"..."   3) "..." with an optional leading +
# The most specific form goes first so its prefix is not split off by a
# later, looser pattern.
while ($query_terms =~ s/(\+<[0-9]+>)\"([^\"]*)\"/ /) {
my $phrase = get_phrase($1,$2);
push @phrases, $phrase if $phrase;
}
while ($query_terms =~ s/(<[0-9]+>)\"([^\"]*)\"/ /) {
my $phrase = get_phrase($1,$2);
push @phrases, $phrase if $phrase;
}
while ($query_terms =~ s/(\+?)\"([^\"]*)\"/ /) {
my $phrase = get_phrase($1,$2);
push @phrases, $phrase if $phrase;
}
}
$query_terms =~ s/^\s+//;
$query_terms =~ s/\s+$//;
@terms = split /\s+/, $query_terms; # get terms
push @terms, @phrases if $DO_PHRASES; # append phrases to terms array
process_terms();
# The file scan is skipped when process_terms() left nothing to search for.
search_files() if (@otherf || @plusf || @minusf);
# process_booleans() and get_sorted_answers() are defined outside this section.
process_booleans();
get_sorted_answers();
}
# Build one phrase term from a quoted phrase captured in the query.
#   $boolean - prefix captured in front of the quotes (may be empty)
#   $phrase  - text captured between the quotes
# Leading and trailing whitespace is removed from the phrase. When the
# trimmed phrase is true the prefix and phrase are returned joined together;
# otherwise the (false) trimmed phrase itself is returned, so a phrase that
# is empty, or is just "0", yields a false value.
sub get_phrase {
    my ($boolean, $phrase) = @_;
    for ($phrase) {
        s/^\s+//;
        s/\s+$//;
    }
    return $phrase ? $boolean . $phrase : $phrase;
}
# Classify every entry of @terms (single words and phrases).
#
# For each term this
#   - derives a comparison copy ($cp) with the leading "+", an accepted
#     "<N>" weight prefix and (outside the weight branch) a leading "-"
#     removed, lower-cased unless the search is case sensitive;
#   - skips terms already seen in this call;
#   - unless stop terms are included ($all) or the term ends in "*", drops
#     terms that are shorter than $MIN_TERM_LENGTH, match the stopword
#     regex, or are a lone "<" or ">", recording them in %stopwords;
#   - files the term under @plusf ("+term", or any plain term when
#     $add_plus is set), @minusf ("-term") or @otherf, and appends its
#     display form to $query_terms_copy. Plus and other terms are also added
#     to @checked_terms; minus terms are not.
# Takes no arguments and works on the file-scoped state.
sub process_terms { # get terms and phrases and start search routine
my %terms; # comparison copies seen so far, for duplicate detection
foreach my $term (@terms) {
my $cp = $term;
my $cp_c; # case-preserving copy of $cp, used for display and %stopwords keys
$cp =~ s/^\+// if $cp ne '+'; # remove + boolean
# Term carries a "<N>" weight prefix followed by more text.
if ($cp !~ /^<[0-9]+>$/ && $cp =~ m/^<([0-9]+)>/) {
if ($1 >= 2 && $1 <= 10000 && $USER_WEIGHTS) {
$cp =~ s/^<[0-9]+>//; # remove user defined weights
} elsif ($cp =~ / / && $USER_WEIGHTS) {
# Reached when the weight was not accepted above and the term contains a
# space; the weight is discarded and the user gets a hint.
$weight_tip = "
Note: Scoring weights must be in the range of <2-10000>";
$cp =~ s/^<[0-9]+>//; # remove user defined weights
}
$cp_c = $cp;
$cp = lc $cp if !$case_sensitive;
next if exists $terms{$cp}; # skip repeats
$terms{$cp} = undef;
} else {
$cp_c = $cp;
$cp = lc $cp if !$case_sensitive;
next if exists $terms{$cp}; # skip repeats
$terms{$cp} = undef;
# The "-" is stripped only after the duplicate check, so "foo" and "-foo"
# count as different terms.
$cp =~ s/^\-// if $cp ne '-'; # remove - boolean
}
unless ($all || $cp =~ /^\S+\*$/) { # ignore stop terms
if (length $cp < $MIN_TERM_LENGTH || $cp =~ m/^$stopwords_regex$/io || $cp =~ m/^(<|>)$/) {
$query_terms_copy .= "$cp_c ";
$cp_c =~ s/^\-// if $cp_c ne '-'; # remove - boolean
$stopwords{$cp_c} = undef;
next;
}
}
# The substitutions below act on the loop alias, so they also strip the
# boolean sign from the matching element of @terms. Any "<N>" weight
# prefix is still part of $term when it is pushed.
if ($term ne '+' && $term =~ s/^\+//) {
# Symbolic reference: empties the package array whose name is the term
# text. NOTE(review): presumably filled with matching files by code outside
# this section; because the name comes from user input, a term spelled like
# an existing package array would clobber that array. Verify.
@$term = ();
push @plusf, $term;
push @checked_terms, $cp_c;
$query_terms_copy .= ($cp_c =~ / / ? "+\"$cp_c\" " : "+$cp_c ");
} elsif ($term ne '-' && $term =~ s/^\-//) {
push @minusf, $term;
$query_terms_copy .= ($term =~ / / ? "-\"$term\" " : "-$term ");
} else {
if ($add_plus) {
# "all terms required" mode: treat an unsigned term like a "+" term.
@$term = ();
push @plusf, $term;
push @checked_terms, $cp_c;
$query_terms_copy .= ($cp_c =~ / / ? "+\"$cp_c\" " : "+$cp_c ");
} else {
push @otherf, $term;
push @checked_terms, $cp_c;
$query_terms_copy .= ($cp_c =~ / / ? "\"$cp_c\" " : "$cp_c ");
}
}
}
}
# Feed every indexed file to search_contents().
#
# DBM mode ($USE_DBM true): walks %f_file_db and passes each key together
# with its stored file path.
# Flat-file mode: reads $DATABASEFILE one line at a time. Each line is one
# tab-separated record; its fields are loaded into the per-file hashes under
# a running record number (starting at 1), and that number is then passed to
# search_contents() as the file key along with the record's file path.
#
# Takes no arguments. Dies if the flat-file database cannot be opened.
sub search_files {
    if ($USE_DBM) {
        while (my ($file, $file_path) = each %f_file_db) {
            search_contents($file, $file_path);
        }
    } else {
        my $file_count = 0;
        open(my $db_fh, '<', $DATABASEFILE) || die "Can't open database file.\n";
        # Iterate over the records actually read from the handle; looping
        # over an empty list here would skip the whole search.
        while (my $record = <$db_fh>) {
            $file_count++;
            # Field order of one record. The meta keywords go into
            # %meta_keyword_db, the same hash that DBM mode ties.
            (
                $f_file_db{$file_count},
                $filenames_db{$file_count},
                $f_date_db{$file_count},
                $f_size_db{$file_count},
                $f_termcount_db{$file_count},
                $descriptions_db{$file_count},
                $titles_db{$file_count},
                $contents_db{$file_count},
                $alt_text_db{$file_count},
                $meta_description_db{$file_count},
                $meta_keyword_db{$file_count},
                $meta_author_db{$file_count},
                $links_db{$file_count},
            ) = split /\t/, $record;
            search_contents($file_count, $f_file_db{$file_count});
        }
        close($db_fh);
    }
}
sub search_contents {
my $file = $_[0];
my $file_path = $_[1];
my $body;
if ($search_body) {
$score_denominator{$file} += $f_termcount_db{$file}; # add character count of body
if ($SAVE_CONTENT) { # search pre-processed files in database (faster but uses disk space)
if ($usehash) { # get contents from DBM if no size limits
$body = $contents_db{$file};
} else { # otherwise get contents from separate files
open (FILE,$DATABASE_DIR.$file) || die "Cannot open $DATABASE_DIR$file: $!";
$body = ;
close (FILE);
}
} else { # search html file directly (slower but saves disk space)
open (FILE,$INDEXER_START.$file_path) || die "Cannot open $INDEXER_START$file_path: $!";
my @LINES = ;
close (FILE);
$body = join ' ', @LINES;
# must clean contents and search larger file (slow part)
$body =~ s/(