There once was a man for whom I worked. He had an intranet full of poorly described HTML files and PDFs without any specific properties, because they came straight out of a mopier and so contained only images. He was on the verge of getting a new search engine and wanted to make important architecture improvements. He came to realize that most of his files lacked meta-data, to the point that his brand new shiny system would be spoiled.
My idea was that, before asking documentalists and content producers to add meta data where needed, we could use the information already in place to deduce a basic set of rules. A file's location can be transformed into meta data because people choose, most of the time, their folders' names. When there's a link to a file, the anchor text provides interesting information about the target. The title of the parent file can be useful to its child.
We'll first see a Perl script that analyses the content in place and determines how bad the meta situation is. Then we'll run the process of reading and updating meta-data in HTML and PDF files.
MetaStat is a simple tool. It recursively browses the HTML files inside a location and produces stats about the presence or lack of meta data. Harmlessly.
#!/usr/bin/perl
# All by HAbeTT
# meta data statistician
# 2005, Creative Commons, some rights reserved
use File::Find;
# MetaStat driver: scan a directory tree, then report how many HTML files
# carry a title, a description and keywords, with the five most frequent
# values of each. The counters ($hits, $iztitle, $izdescription, $izkeywords)
# and the tallies (%titles, %descriptions, %keywords) are package globals
# filled in by process().

# Unbuffer STDOUT so the progress dots printed by process() show up while the
# scan is running.
$| = 1;
$hits = 0;
# directories to scan
find(\&process, "f:/habett");
# output results
print "\n\nMetaStat\n$hits files analyzed\n";
# Nothing was analyzed: there is no ratio to compute, and dividing by $hits
# would abort the script with "Illegal division by zero".
exit(0) unless $hits;
print_section("titled",    $iztitle,       \%titles);
print_section("described", $izdescription, \%descriptions);
print_section("keyworded", $izkeywords,    \%keywords);
exit(0);

# Print one section of the report.
#   $label   - word used in the "NN percent <label>" line
#   $present - number of files in which the item was found (may be undef)
#   $tally   - hash ref mapping each value seen to its number of occurrences
sub print_section {
    my ($label, $present, $tally) = @_;
    printf("\n%02d percent %s\n", 100 * ($present || 0) / $hits, $label);
    # most frequent values first
    my @ranked = sort { $tally->{$b} <=> $tally->{$a} } keys %$tally;
    # List at most five entries, and never more than there are distinct
    # values, so no empty rank is printed.
    my $last = $#ranked < 4 ? $#ranked : 4;
    for my $i (0 .. $last) {
        print " ".($i+1)." : \"".$ranked[$i]."\" = ".$tally->{$ranked[$i]}."\n";
    }
}
# File::Find callback for MetaStat: inspect one directory entry and, when it
# is a readable HTML file, update the global statistics.
#
# Reads:   $File::Find::name (path of the current entry)
# Updates: $hits                                  - files analyzed
#          $iztitle, $izdescription, $izkeywords  - files having the item
#          %titles, %descriptions, %keywords      - occurrences of each value;
#              a file lacking the item is tallied under the empty string
# Prints one "." per analyzed file as a progress indicator.
sub process {
    # get filename
    my $file = $File::Find::name;
    # eliminate directories
    return if -d $file;
    # eliminate non html files; the pattern is anchored so that names such as
    # "page.html.bak" are not mistaken for HTML
    return unless $file =~ /\.html?$/i;
    # Three-argument open with a lexical handle. A file that cannot be read
    # is reported and left out of the statistics instead of being counted as
    # an empty document.
    my $fh;
    unless (open($fh, '<', $file)) {
        warn "Can't read $file: $!\n";
        return;
    }
    my $html = do { local $/; <$fh> };
    close($fh);
    $html = "" unless defined $html;
    $hits++;
    print ".";
    # title: /s lets the text span several lines, and the same match decides
    # both whether a title is present and what it is
    my ($title) = ($html =~ /<title[^>]*>(.*?)<\/title>/is);
    $iztitle += 1 if defined $title;
    $titles{ defined $title ? $title : "" }++;
    # description and keywords: [^>]* keeps the match inside one <meta> tag,
    # so a value is never picked up from a neighbouring tag
    my %content_of;
    for my $name ('description', 'keywords') {
        my (undef, $value) = ($html =~
            /<meta\b[^>]*\b(?:name|http-equiv)\s*=\s*["']?$name\b[^>]*\bcontent\s*=\s*(["'])(.*?)\1/is);
        $content_of{$name} = $value;
    }
    $izdescription += 1 if defined $content_of{'description'};
    $descriptions{ defined $content_of{'description'} ? $content_of{'description'} : "" }++;
    $izkeywords += 1 if defined $content_of{'keywords'};
    $keywords{ defined $content_of{'keywords'} ? $content_of{'keywords'} : "" }++;
}
As you can see, no exotic modules are used; it's just a matter of regexps and File::Find. Systemism pragma: auditor first, then actor.
MetaCast is yet another recursive directory tree browser, a spider with the intent to build a comprehensive mesh. First it targets HTML files because it knows that's where the hyperlinks are. It grabs the parent's title and directory location because it believes they must be interesting intel. Then it loops through the links.
For each link targeting the current volume, it grabs the anchor text and the target directory, if it exists. As potential keywords for the child it'll include the anchor text, the parent's title, the parent's folder and the child's folder.
If it's a PDF file, it'll use the very powerful PDF::API2 module to read and then write properties. It'll set the title, subject and it'll add the relevant keywords where needed.
If it's an HTML file, it'll use regexps to determine whether there is a title, a description and keywords, and then evaluate them. It'll only modify the markup related to these meta data, injecting what it understood as relevant.
#!/usr/bin/perl
# All by HAbeTT
# Meta data processor
# 2005, Creative Commons, some rights reserved
use File::Find;
use PDF::API2;
# Starting points of the crawl; process() is handed to File::Find as the
# callback to run on what it finds below them.
my @roots = ("/wwwroot");
find({ wanted => \&process }, @roots);
exit 0;
# File::Find callback for MetaCast. For every HTML file met during the crawl
# (the "parent"), follow each local link to a PDF or HTML file (the "child")
# and enrich the child's meta data with what the parent says about it:
#   - keywords: words of the anchor text, of the parent's title, the parent's
#     folder name and the child's folder name, added when not already there;
#   - HTML child: title defaults to the anchor text, description to
#     "<parent title> - <anchor text>"; existing non-empty values are kept;
#   - PDF child: anchor text is appended to Title, parent title to Subject.
#
# Reads $File::Find::name. Link targets are opened relative to the current
# directory, i.e. this relies on File::Find's default behaviour of chdir-ing
# into the directory of the file being visited.
# Targets that do not exist or cannot be opened are reported with warn and
# skipped; a failure to write a target is fatal.
sub process {
    # get filename
    my $file = $File::Find::name;
    # eliminate directories
    return if -d $file;
    # eliminate non html files (the dot is required: "foohtml" is not HTML)
    return unless $file =~ /\.html?$/i;
    # read file contents
    my $code = _slurp($file);
    unless (defined $code) {
        warn "Can't read $file: $!\n";
        return;
    }
    print "$file\n";
    # grab parent directory name
    my $parent = _last_dir($file);
    # Turn every line break (CRLF, LF or CR) into one space so that titles
    # and anchors spanning several lines can be matched below.
    $code =~ s/\015\012|\012|\015/ /g;
    # get title
    my ($titre) = ($code =~ /<title[^>]*>(.*?)<\/title>/i);
    $titre = _plain_text($titre);
    # get the links; [^>]* keeps href inside the <a ...> tag it belongs to
    my @links = ($code =~ /<a\b[^>]*?\bhref\s*=\s*"([^"]*)"/gi);
    # loop through links
    for my $candid (@links) {
        # external links
        next if $candid =~ /^http/;
        # other volumes
        next if $candid =~ /:/;
        # only PDF and HTML targets are handled
        my $is_pdf  = $candid =~ /\.pdf$/i;
        my $is_html = $candid =~ /\.html?$/i;
        next unless $is_pdf or $is_html;
        # A broken link must neither stop the crawl nor create an empty file.
        unless (-f $candid) {
            warn "Skipping $candid (linked from $file): no such file\n";
            next;
        }
        # get the anchor; \Q..\E because a URL is full of regex
        # metacharacters ("." and "?" at the very least)
        my ($anchor) = ($code =~
            /<a\b[^>]*?\bhref\s*=\s*"\Q$candid\E"[^>]*>(.*?)<\/a>/i);
        $anchor = _plain_text($anchor);
        # get child
        my $child = _last_dir($candid);
        # prepare keywords; split ' ' drops the empty fields that
        # consecutive or leading spaces would otherwise produce
        my @seeds = split ' ', "$anchor $titre $parent $child";
        if ($is_pdf) {
            # links to PDF files
            # PDF::API2->open throws on unreadable or damaged files
            my $pdf = eval { PDF::API2->open($candid) };
            unless ($pdf) {
                warn "Can't open $candid: $@\n";
                next;
            }
            print "=> $candid\n";
            # read metas
            my %oj = $pdf->info();
            # write metas
            $pdf->info('Title'    => _append_once($oj{'Title'}, $anchor),
                       'Subject'  => _append_once($oj{'Subject'}, $titre),
                       'Keywords' => _merge_keywords($oj{'Keywords'}, @seeds));
            # save
            $pdf->saveas($candid);
        }
        else {
            # links to HTML files
            my $html = _slurp($candid);
            unless (defined $html) {
                warn "Can't read $candid: $!\n";
                next;
            }
            print "=> $candid\n";
            my $before = $html;
            # parse meta data
            my ($title) = ($html =~ /<title[^>]*>(.*?)<\/title>/is);
            my $iztitle = defined $title;
            my $description = _meta_content($html, 'description');
            my $keywords =
                _merge_keywords(_meta_content($html, 'keywords'), @seeds);
            # unless set title to anchor name
            $title = $anchor
                unless defined $title and $title =~ /\S/;
            # unless set description to parent name dash anchor
            $description = join(" - ", grep { length } $titre, $anchor)
                unless defined $description and $description =~ /\S/;
            # set title (an empty one is never written)
            if ($title =~ /\S/) {
                if ($iztitle) {
                    $html =~ s/<title[^>]*>.*?<\/title>/<title>$title<\/title>/is;
                }
                else {
                    $html =~ s/<\/head/<title>$title<\/title>\n<\/head/i;
                }
            }
            # set metas
            _set_meta(\$html, 'description', $description);
            _set_meta(\$html, 'keywords', $keywords);
            # nothing to add: leave the file untouched
            next if $html eq $before;
            # write target file
            open(my $out, '>', $candid) or die "Can't write $candid: $!\n";
            print {$out} $html;
            close($out) or die "Can't finish writing $candid: $!\n";
        }
    }
}

# Return the whole content of $path, or undef when it cannot be opened.
# Three-argument open: a path taken from an href must never be interpreted as
# an open mode or a pipe.
sub _slurp {
    my ($path) = @_;
    open(my $fh, '<', $path) or return;
    local $/;
    my $content = <$fh>;
    close($fh);
    return defined $content ? $content : "";
}

# Return the name of the last folder of a slash-separated path, or "" when
# the path has no folder part ("." and ".." are not folder names).
sub _last_dir {
    my ($path) = @_;
    my @parts = grep { length and $_ ne '.' and $_ ne '..' } split m{/}, $path;
    pop @parts;    # the file name itself
    return @parts ? $parts[-1] : "";
}

# Reduce an HTML fragment to plain text: tags removed, runs of whitespace
# collapsed, no leading or trailing blank. undef yields "".
sub _plain_text {
    my ($text) = @_;
    return "" unless defined $text;
    $text =~ s/<[^>]*>//g;
    $text =~ s/\s+/ /g;
    $text =~ s/^ | $//g;
    return $text;
}

# Return $current followed by $extra, unless $extra is empty or $current
# already contains it (case-insensitively), so that running the tool twice
# does not append the same text twice.
sub _append_once {
    my ($current, $extra) = @_;
    $current = "" unless defined $current;
    return $current
        if $extra eq "" or index(lc $current, lc $extra) >= 0;
    return length($current) ? "$current $extra" : $extra;
}

# Return the comma-separated list $keywords extended with every seed it does
# not already contain (case-insensitive substring test).
sub _merge_keywords {
    my ($keywords, @seeds) = @_;
    $keywords = "" unless defined $keywords;
    for my $seed (@seeds) {
        # \Q..\E: a seed is free text, not a pattern. Empty seeds are
        # skipped: an empty pattern would re-run the last successful match.
        next if $seed eq "" or $keywords =~ /\Q$seed\E/i;
        $keywords .= length($keywords) ? ", $seed" : $seed;
    }
    return $keywords;
}

# Compile the pattern matching one complete <meta name="$name" content="...">
# tag; the content value is the second capture. [^>]* keeps the match within
# a single tag.
sub _meta_tag_re {
    my ($name) = @_;
    return qr/<meta\b[^>]*\b(?:name|http-equiv)\s*=\s*["']?\Q$name\E\b[^>]*\bcontent\s*=\s*(["'])(.*?)\1[^>]*>/is;
}

# Return the content of the meta tag called $name in $html, or undef when
# there is no such tag.
sub _meta_content {
    my ($html, $name) = @_;
    my $re = _meta_tag_re($name);
    my (undef, $value) = ($html =~ $re);
    return $value;
}

# Make <meta name="$name" content="$value"> the only meta tag of that name
# in the document referenced by $html_ref, placing it just before </head>.
# The document is left alone when $value is blank or when it has no </head>
# (removing the old tag without a place to put the new one would lose data).
sub _set_meta {
    my ($html_ref, $name, $value) = @_;
    return unless defined $value and $value =~ /\S/;
    return unless $$html_ref =~ /<\/head/i;
    # a raw double quote would end the attribute early
    (my $attr = $value) =~ s/"/&quot;/g;
    my $re = _meta_tag_re($name);
    $$html_ref =~ s/$re\s*//;
    $$html_ref =~ s/<\/head/<meta name="$name" content="$attr">\n<\/head/i;
    return;
}