#!/usr/bin/perl -w use DBI; #connect to database my $dbh = DBI->connect('dbi:mysql:measure:mysql.ibiblio.org','user','pass'); #set variables my $dir= "wp"; #open directory containing files, get file list opendir(FOLDER, $dir); @subfiles = readdir(FOLDER); #loop through all the files foreach $file_name (@subfiles) { if ($file_name ne ".." && $file_name ne ".") { #format publication code from filename $code = substr($file_name,0,2) . "-" . substr($file_name,2,2) . "-" . substr($file_name,4,2); #open file, smash into one big string open(FILE, $dir . "/" . $file_name); $whole_file = join("",); # grab title between
# NOTE(review): this copy of the script has lost its line breaks and, apparently,
# some "<...>" tokens. As laid out here, everything after "#!" on the first line
# is one long comment, so none of the setup on that line would run as written.
#
# Visible intent of the first line: connect through DBI, list the entries of
# directory "wp", and for each entry other than "." and ".." build $code as
# "xx-xx-xx" from the first six characters of the file name, open the file and
# collect its contents into $whole_file. Neither opendir nor open has its
# return value checked.
#
# join("",) has no list operand as written, so $whole_file would be empty;
# presumably a <FILE> read was stripped from inside the call -- TODO confirm
# against the original source.
#
# The next line splits $whole_file on opening/closing h1 tags and cleans
# element 1 as $title; when "Author:"/"Authors:" is present it cleans the text
# between that label and "Document Number:" as $authors; when "Date:" is
# present it cleans the text between "Date:" and "Abstract" as $date.
tags @fields = split(/<\/?h1>/i, $whole_file); $title = clean_text($fields[1]); # if there is an author field, grab it if ($whole_file =~ /Authors?\:/) { @fields = split(/Authors?\:|Document Number\:/i, $whole_file); $authors = clean_text($fields[1]); } # if there is a date field, grab it if ($whole_file =~ /Date\:/) { @fields = split(/Date:|Abstract/i, $whole_file); $date = clean_text($fields[1]); } # grab abstract @fields =split(/
# NOTE(review): the pattern opened by "split(/" above is truncated, and the
# text that followed it is missing. $statement is used further down but is
# never assigned anywhere in the visible text, so the abstract extraction and
# the statement construction cannot be recovered from this copy -- TODO restore
# them from the original source rather than guessing.
Abstract<\/H2>\s+|\s+
do($statement) or die "Can't execute $statement: $dbh- >errstr\n"; close(FILE); } }

# clean_text($html)
#
# Normalise a fragment of HTML text into a single trimmed line.
#
# Parameters:
#   $html - the raw text to clean; may be undef (e.g. when a split produced
#           no second field), in which case an empty string is returned.
#
# Returns:
#   A copy of the text in which
#     * each <p>, <br>, <b>, <i>, <strong>, <em>, <h1>, <h2>, <h3> tag
#       (opening or closing, any letter case, no attributes) becomes a space,
#     * every run of whitespace is collapsed to one space,
#     * a single leading and a single trailing space are removed,
#     * every double quote is preceded by a backslash.
#
# The work is done on a lexical copy: the global $_ is left untouched, so the
# sub is safe to call from inside for/map/grep blocks, including ones where $_
# is aliased to a read-only value.
#
# NOTE(review): only double quotes are escaped; backslashes and single quotes
# pass through unchanged. If the result is interpolated into an SQL string,
# DBI placeholders would be the safer route -- the statement construction is
# not visible here, so confirm at the call site.
sub clean_text {
    my ($html) = @_;

    # Treat a missing field as empty text instead of warning on undef.
    my $text = defined $html ? $html : '';

    $text =~ s/<\/?(?:p|br|b|i|strong|em|h1|h2|h3)>/ /gi;   # simple tags -> one space
    $text =~ s/\s+/ /g;                                     # collapse whitespace runs
    $text =~ s/^ | $//g;                                    # trim the leftover edge spaces
    $text =~ s/"/\\"/g;                                     # backslash-escape double quotes

    return $text;
}