#!/usr/local/bin/perl
# Copyright © 2001 by James F. Carter
# Permission is granted for any use provided this copyright notice
# remains intact. Specifically, if you want to hack this for another language,
# feel free.
#
# Issues:
# % Suppress 1-letter "words".
# . Unsuppress letterals.
# . Provide for incoming and outgoing xrefs.
# . Multi-word queries
# . Order the hits by file position.
# % What is "modal~test" doing as a key? (from English)
# % What is 2.4.5,4.6.3 doing as a key?
# % Only kill keys from stoppable areas. (Leave g-vo).
# % Killing seems not to actually kill anything.
#
# CGI script to search the xankua database.
# Put this in your HTML document (suitably formatted) (note the field names):
# (NOTE(review): the example HTML form that followed was lost when
# angle-bracketed text was stripped from this file.)
# The "referer" field is for the URL of the original referrer, i.e. the form
# that called this script, in case this script calls itself multiple times.
# The program attempts to include this referrer document at the end of the
# output, provided a file of the same basename exists in the same directory as
# the script.
# The database is sought in the same directory where the script is.
# If a command line argument of -R is given, the database is rebuilt.
# Filenames:
#   xankua.dat        Text table
#   xankua.{dir,pag}  DBM version of database
# As presently programmed, the file has 1756 lines (including thesaurus
# categories), 12452 keys, 21528 targets.
# Command line arguments (must be in this order):
#   -f basename   Basename of database (for debug)
#   -R            Rebuild database
#
# NOTE(review): this whole section had been collapsed onto one comment line
# by an extraction bug, which disabled the code; line structure restored.
$basename = "xankua";
if ($ARGV[0] eq "-f") {
    $basename = $ARGV[1];
    splice(@ARGV, 0, 2);
}
$rebuild = ($ARGV[0] eq "-R");

# Compute the name of the database.  Under CGI the script path comes from
# the server environment; from the command line, from $0.
$cgi = ($ENV{GATEWAY_INTERFACE} ne '');
$script = $cgi ? $ENV{SCRIPT_FILENAME} : $0 =~ /\// ? $0 : "./$0";
$dir = $script;
substr($dir, rindex($dir, '/')) = '';    # strip the filename component
$dbname = "$dir/$basename";

# Redirect stderr to stdout so error output reaches the browser.
open(STDERR, ">&STDOUT") or die "Can't redirect STDERR to STDOUT: $!\n" if $cgi;

# Compute name of the HTML file (same basename as the script, .html suffix).
$htmlfile = $script;
substr($htmlfile, rindex($htmlfile, '.')) = ".html";
open(DATA, $htmlfile) || die "Failed to open HTML file $htmlfile: $!\n"
    if -r $htmlfile && !$rebuild;

# Print standard header area.  Print HTML file up to separator.
if (!$rebuild) {
    print "Content-type: text/html\n\n";
    # NOTE(review): the separator line presumably contained an HTML comment
    # marker that was stripped from this file; only the newline survives.
    $sep = "\n";
    # FIX: the <DATA> handle had been stripped from the while condition.
    while (<DATA>) {
        print;
        last if $_ eq $sep;
    }
}

# Open the database text file (seek targets index into this file).
open(DB, "$dir/$basename.dat") or die "Can't open $dir/$basename.dat: $!\n";

use SDBM_File;    # Do it here so error messages are printed
use Fcntl;
$DB = "$dir/$basename";
@DB = ("$DB.dir", "$DB.pag");
unlink @DB if $rebuild;
# tie(%DB, 'GDBM_File', $DB,
#     ($rebuild ? &GDBM_WRCREAT : &GDBM_READER), 0644)
tie(%DB, 'SDBM_File', $DB,
    ($rebuild ? (O_RDWR | O_CREAT) : O_RDONLY), 0644)
    or die "Can't open database $DB: $!\n";

# Timing data (on 500 MHz Pentium III): GDBM, ODBM, NDBM appear
# to be identically implemented. SDBM is slightly different.
# Data about           GDBM     SDBM
# Words                1756     1756    (Base file records)
# Keys                12147    12147    (DB records)
# Targets             20531    20531    (Seek ptrs in DB recs)
# Base file size     187520   187520    bytes
# DBM file size      991232   863232    bytes
# Time to rebuild      3.53     3.01    secs
# Time             1.72e-4  1.46e-4    sec/target

# Time to rebuild the database?
&rebuild() if $rebuild;

# Extract the query parameters into hash %q.
# HTTP encoding changes spaces to + signs; change it back.
# No need to translate other special codes.
foreach $unit (split(/\&/, $ENV{QUERY_STRING})) {
    ($k, $v) = split("=", $unit);
    $q{$k} = &decodevalue($v);
}
$q{referer} = $ENV{HTTP_REFERER} unless $q{referer} ne '';

# Which areas are to be searched?  %area maps the one-letter form-field
# codes to field names; %areax to human-readable descriptions.
BEGIN {
    %area = qw(g word t thes e engl d definition x xref);
    %areax = (g => "-Gua\\spi", x => "Cross references",
              d => "Definition", e => "English keywords",
              t => "Thesaurus category", a => "All fields");
}
foreach $e (qw(g x d e t a)) {
    next if $q{$e} eq '';
    push(@area, $e);
    push(@areax, $areax{$e});
}
$areax = join(', ', @areax);
unless (@area) {
    print "No area was selected; using word and definition.\n";
    @area = qw(g d);
    $areax = "(Missing)";
}
# Print the query as received.
#
# NOTE(review): the next three lines are corrupted.  Whatever stripped
# angle-bracketed text from this file ate a `print <<EOF;` heredoc (the
# query-echo table) together with the opening of the surrounding
# `unless ($q{max} > 0) {` block; only the tail of the validation message
# survives.  As written this does not parse.  Intended behavior (from the
# surviving text): if the requested maximum is not positive, warn and
# default $q{max} to 10.
print < Maximum results of `$q{max}' must be > 0; using 10.\n";
$q{max} = 10;
}
# An empty search word means there is nothing to look up: report it and
# finish the page.  finish() prints the rest of the form and exits.
unless ($q{word} ne '') {
print " Search word is null. No results.\n";
&finish();
}
# Regexp to recognize the sought words: an alternation of the normalized
# query words, each quoted individually.
#
# BUG FIX: the original code quotemeta()'d the space-joined string and then
# ran tr/ /|/.  quotemeta escapes the separator spaces as "\ ", so the tr
# produced e.g. "foo\|bar" -- a regex that matches the *literal* string
# "foo|bar" rather than the alternation foo|bar.  Quote each word on its
# own and join with '|' instead.  (doquery() interpolates $regexp with /o.)
$regexp = join('|', map { quotemeta($_) } &normalize($q{word}));

# Do the queries. The subrt knows to skip duplicates.
QUERY: foreach $w (split(/[,\s]+/, $q{word})) {
    foreach $e (@area) {
        # $DB{"$e$w"} is a space-separated list of seek positions into the
        # text database for word $w in area $e.
        foreach $posn (split(' ', $DB{"$e$w"})) {
            &doquery($posn) or last QUERY;
        }
    }
}
# If cross references were requested, collect every word named in the
# xref field of a record found so far, then look each of those words up
# in the -gua\spi word area ("g") as additional hits.
if ($q{xref} ne '') {
    my %wanted;
    # Phase 1: harvest xref words from the current hit set.
    foreach my $record (values %hits) {
        $wanted{$_}++ foreach split(/[,\s]+/, $record->{xref});
    }
    # Phase 2: run each harvested word as a "g"-area query.
    XREF: foreach my $word (keys %wanted) {
        foreach my $seekpos (split(' ', $DB{"g$word"})) {
            last XREF unless &doquery($seekpos);
        }
    }
}
# Guts of the query process.  Argument: a seek position into the DB text
# file (as stored in the tied %DB index).  On the first visit to a
# position, reads the record there, wraps each query-word match in
# <B>...</B>, unpacks it with munpack() and caches it in %hits; every
# visit (first or repeat) bumps the record's score.
# Returns 1 on success, 0 if the seek failed.
sub doquery {
    my ($posn) = @_;
    if (!exists($hits{$posn})) {
        seek(DB, $posn, 0) or do {
            print STDERR " Failed to seek to `$posn': $!\n";
            return 0;
        };
        my $line = readline(DB);    # Read the record for that word.
        # Matching words in bold face.
        # FIX: the opening <B> tag had been stripped from this file (only
        # the closing </B> survived), producing unbalanced HTML; restored.
        # Also read into a lexical instead of clobbering global $_.
        $line =~ s/\b($regexp)\b/<B>$1<\/B>/igo if $regexp ne '';
        $hits{$posn} = &munpack($line);
    }
    $hits{$posn}{score}++;
    1;
}
# Organize the results: thesaurus categories sort ahead of everything
# else, then records with more hits (higher score) first, then ascending
# file position.
@hits = sort {
    ($hits{$b}{class} eq 't') <=> ($hits{$a}{class} eq 't')
        || $hits{$b}{score} <=> $hits{$a}{score}
        || $a <=> $b
} keys %hits;

my $found = @hits;    # scalar context: how many records matched
print " $found matching records found.\n";

# Trim the list to the requested maximum.
if (@hits > $q{max}) {
    print "Showing the first $q{max}.\n";
    $#hits = $q{max} - 1;    # keep only the first $q{max} positions
}

# Provide the column headers (if there are any results).  Position -1 is a
# synthetic record holding the header labels.
if (@hits) {
    unshift(@hits, -1);
    $hits{"-1"} = &munpack("Word\tCl\tRank\tCategory\tEnglish\tPhonetic\t \t \t \t\t\t\n");
}
# Report the results.
#
# NOTE(review): everything from here to finish() is corrupted.  The
# stripper that removed angle-bracketed text ate the `<<EOF` heredoc
# introducer and all HTML markup, and fragments of two different sections
# (the query-echo table and the per-record report loop) have been fused
# together.  The surviving `unless ($q{max} > 0)` opener belongs with the
# validation fragment seen earlier in the file, and `$emsg` was presumably
# built by the lost report loop.  Do not edit the lines below without the
# original source; the dangling quotes span multiple lines.
print 'Query:
Word or Category $q{word}
Area to Search $areax
Max Results $q{max}
EOF
unless ($q{max} > 0) {
print "
\n$emsg";
&finish();
# End of main thread.
# Appends the rest of the form to the output, and exits.
# Appends the rest of the companion HTML file to the output, then exits.
# The file was opened on the DATA handle at startup and already read up to
# the separator line; this copies the remainder.  Any input element named
# "referer" gets its value attribute rewritten to the saved referrer URL so
# the form can round-trip through this script.
sub finish {
    # FIX: the <DATA> handle had been stripped from the while condition,
    # leaving the syntactically invalid `while ()`.
    while (<DATA>) {
        s/value=\".*\"/value=\"$q{referer}\"/ if /name=referer/;
        print;
    }
    exit 0;
}
# Renders control characters in a value as caret notation (^A .. ^_),
# leaving newline (\012) untouched, so the value can be displayed safely.
sub xformvalue {
    my ($raw) = @_;
    # Split into alternating (plain text, control char) pairs.  The
    # character class covers all controls except \012 (newline).
    my @chunks = split(/([\0-\011\013-\037])/, $raw);
    my $out = '';
    while (@chunks) {
        my ($text, $ctrl) = splice(@chunks, 0, 2);
        $out .= $text;
        # Map e.g. \001 -> "^A" by adding 0100 (64) to the code point.
        $out .= '^' . chr(ord($ctrl) + 0100) if defined $ctrl && $ctrl ne '';
    }
    return $out;
}
# Inverts the HTTP (application/x-www-form-urlencoded) encoding:
# '+' becomes a space and each %XX escape becomes the byte with that hex
# code.  Argument: the encoded string.  Returns the decoded string.
sub decodevalue {
    my ($val) = @_;
    $val =~ tr/+/ /;    # Spaces are changed to + signs.
    # FIX: the original split-on-'%' implementation ended with a stray "v"
    # token (a syntax error) and would mangle any '%' not followed by two
    # hex digits.  A single substitution does the whole job; non-escape
    # '%' characters pass through unchanged.
    $val =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/ge;
    return $val;
}
# Field layout of a database record (one tab-separated line of xankua.dat).
# @fields lists the columns in file order; munpack() below uses it as the
# hash-slice key list, so the ORDER of this list is load-bearing.
# (NOTE(review): the original comment here read "Args: \%record, $data",
# which does not match munpack()'s actual one-argument interface.)
BEGIN {
@fields = qw(word class rank thes engl phon chinese latin loglan xref definition comments);
# word        -gua\spi word
# class
#   P = Phrase relative pronoun (modal pronouns are S's), Letteral
#   S = Pure structure word (caselink, digit, etc.)
#   p = A noun in English
#   q = Relational, but not a verb in English
#   r = Relation
#   t = Category header
# rank        (not used)
# thes        Thesaurus category
# engl        Comma separated English keywords
# phon        Phonetic English (for making words)
# chinese     Phonetic Chinese (for making words)
# latin       Phonetic Latin (for making words)
# loglan      Old Loglan word
# xref        Cross reference to other -gua\spi words
# definition
# comments
}
# Unpack one tab-separated database line into a hash keyed by the global
# @fields list (defined in the BEGIN block above).
# Argument: the raw record line.  Returns a hash reference.
sub munpack {
    my ($line) = @_;
    chomp $line;
    my %record;
    @record{@fields} = split(/\t/, $line);
    return \%record;
}
# Splits the argument into words after normalizing it:
# tokens starting with a capital X are removed (X-arguments, but not the
# letteral lowercase x), everything becomes lower case, and commas/tildes
# become word separators.  Outside the thesaurus area ('t'), remaining
# punctuation and standalone digit runs are dropped, and -- except in the
# definition area ('d') -- the FIRST one-letter word is removed (the
# substitution is deliberately not /g; see the issue list at the top).
# Arguments: the text, and the one-letter area code (may be omitted).
# Returns the list of normalized words.
sub normalize {
    my ($text, $field) = @_;
    $text =~ s/X\S+//g;          # Lose arguments (but not letteral x)
    $text = lc $text;            # Everything becomes lower case
    $text =~ tr/,~/  /;          # Comma and tilde act as separators
    if ($field ne 't') {
        $text =~ s/[^\w\s]//g;   # Lose other punctuation
        $text =~ s/\b\d+\b//g;   # Lose trashed thesaurus categories
        $text =~ s/\b\w\b// if $field ne 'd';    # Lose a 1-byte word
    }
    return split ' ', $text;     # Split, losing all whitespace
}
# Rebuilds the DBM index from the text database already open on DB.
# Key prefixes (area codes): guaspi word, cognates, definition, category,
# all.  For each indexable word it appends the record's seek position to
# the DBM entry keyed "<area><word>".
#
# NOTE(review): this subroutine is truncated -- see the damaged loop
# below.  Only the setup (stop lists and counters) survives intact.
sub rebuild {
my($k, $t, $j);
# Common English words that should not become index keys.
my %stop = qw(is 1 a 1 the 1 and 1 by 1 to 1 of 1 in 1 for 1 case 1 at 1 that 1 as 1 about 1 do 1 but 1 as 1 no 1 can 1 it 1 has 1 are 1 eg 1 its 1 ); #Stop list
my %stopk = qw(d 1 c 1 a 1); # Stops apply to these areas
my $nkeys = 0; # Number of keys
my $ntgts = 0; # Number of targets
my $dots = 50; # Print a dot every N lines
my $lines = 0; # Number of words indexed
select STDERR;
print STDERR "Rebuilding database: \n";
$| = 1;  # Unbuffer so the progress dots appear as they are printed.
my($posn) = tell DB;
# NOTE(review): the rest of this subroutine is corrupted.  The stripper
# that removed angle-bracketed text ate the read loop's condition
# (probably `while (<DB>)`) and fused in fragments of the per-record
# report printing from the main thread (a heredoc whose `<<EOF`
# introducer is gone).  Do not edit below without the original source.
while (${$rec}{word} ${$rec}{thes} (${$rec}{class}) ${$rec}{engl} $xl
EOF
print " ${$rec}{definition}\n" if ${$rec}{definition} ne '';
print " Note: ${$rec}{comments}\n" if ${$rec}{comments} ne '';
print " See also: ${$rec}{xref}\n" if ${$rec}{xref} ne '';
}
print "