MetaCast

24/04/2005

Situation

There once was a man for whom I worked. He had an intranet full of poorly described HTML files and PDFs without any specific properties, because they came straight out of a mopier and so contained nothing but images. He was on the verge of getting a new search engine and wanted to make important architecture improvements. He came to realize that most of his files lacked meta-data, to the point that his brand new shiny system would be spoiled.

My idea was that, before asking documentalists and content producers to add meta-data where needed, we could use the information already in place to deduce a basic set of rules. A file's location can be turned into meta-data because, most of the time, people choose their folders' names themselves. When there's a link to a file, the anchored text provides interesting information about the target. The title of the parent file can be useful to its child.

We'll first see a Perl script that analyses the content in place and determines how bad the meta-data situation is. Then we'll run the process of reading and updating meta-data in HTML and PDF files.

MetaStat

MetaStat is a simple tool. It recursively browses the HTML files inside a location and produces stats about the presence or lack of meta-data. Harmlessly.

#!/usr/bin/perl
# All by HAbeTT
# meta data statistician
# 2005, Creative Commons, some rights reserved

use File::Find;

# unbuffered output, so that the progress dots show up as files are read
$| = 1;

# number of HTML files analysed; incremented by process()
$hits = 0;

# directories to scan
find(\&process, "f:/habett");

# output results
print "\n\nMetaStat\n$hits files analyzed\n";

# One section per meta datum.  The presence counters ($iz...) and the tallies
# (%titles, %descriptions, %keywords) are package globals filled by process().
_report_field("titled",    $iztitle,       \%titles);
_report_field("described", $izdescription, \%descriptions);
_report_field("keyworded", $izkeywords,    \%keywords);

exit(0);

# Print the report section of one meta datum.
#
#   $label   : word completing "NN percent ..." (titled / described / keyworded)
#   $present : number of files carrying the datum (may be undef when none did)
#   $tally   : hash ref, value => number of files using exactly that value
#
# Prints the presence ratio, then the most frequent values (five at most).
sub _report_field {
  my ($label, $present, $tally) = @_;
  # an empty scan must yield "00 percent", not a division by zero
  my $percent = $hits ? 100 * ($present || 0) / $hits : 0;
  printf ("\n%02d percent %s\n", $percent, $label);
  # most frequent first; ties sorted by value so that two runs print alike
  my @ranked = sort { $tally->{$b} <=> $tally->{$a} or $a cmp $b } keys %$tally;
  # never list more rows than there are distinct values
  my $last = $#ranked < 4 ? $#ranked : 4;
  for my $i (0..$last) {
    print "  ".($i+1)." : \"".$ranked[$i]."\" = ".$tally->{$ranked[$i]}."\n";
  }
}

# File::Find callback for MetaStat: tally the meta data of one HTML file.
#
# Reads   : $_ (entry name inside the directory File::Find chdir()ed into)
#           and $File::Find::name (full path, used in messages only).
# Updates the package globals that the report code reads once find() is done:
#   $hits                                  number of HTML files analysed
#   $iztitle, $izdescription, $izkeywords  files carrying the tag at all
#   %titles, %descriptions, %keywords      how many files use each value
#                                          (a missing value is tallied as "")
# Files are only read, never modified.  Unreadable files are reported on
# STDERR and left out of the statistics.
sub process {
  my $name = $_;
  # eliminate directories
  return if (-d $name);
  # eliminate non html files: the extension has to END the name, otherwise
  # "page.html.bak" and everything below "page.html_files/" would be counted
  return unless ($name =~ /\.html?$/i);
  # read file contents
  my $in;
  unless (open ($in, '<', $name)) {
    warn "\nCan't read $File::Find::name: $!\n";
    return;
  }
  my $html = do { local $/; <$in> };
  close ($in);
  $html = "" unless (defined $html);
  $hits++;
  print ".";
  # title: may span several lines, so match with /s and fold the whitespace
  my ($title) = ($html =~ /<title\b[^>]*>(.*?)<\/title>/is);
  $title = "" unless (defined $title);
  $title =~ s/\s+/ /g;
  $title =~ s/^ | $//g;
  $titles{$title}++;
  $iztitle += 1 if ($html =~ /<title\b[^>]*>/i);
  # description
  my ($has_description, $description) = _meta_value($html, "description");
  $descriptions{$description}++;
  $izdescription += 1 if ($has_description);
  # keywords
  my ($has_keywords, $keyword) = _meta_value($html, "keywords");
  $keywords{$keyword}++;
  $izkeywords += 1 if ($has_keywords);
}

# Look up <meta name="$name" content="..."> in $html.
# Returns ($present, $value): $present is true when such a tag with a content
# attribute exists, $value is its quoted content ("" when there is none).
# The search never leaves the tag, so a neighbouring <meta> or body text
# mentioning "description" or "content" cannot be mistaken for it.
sub _meta_value {
  my ($html, $name) = @_;
  my ($tag) = ($html =~ /(<meta\b(?=[^>]*\bname\s*=\s*["']?\Q$name\E\b)[^>]*>)/i);
  return (0, "") unless (defined $tag && $tag =~ /\bcontent\b/i);
  my ($quote, $value) = ($tag =~ /\bcontent\s*=\s*(["'])(.*?)\1/is);
  return (1, defined $value ? $value : "");
}

As you can see, no exotic modules are used; it's just a matter of regexps and File::Find. Systemism pragma: auditor first, then actor.

The core

MetaCast is yet another recursive directory tree browser, a spider with the intent to build a comprehensive mesh. First it sights HTML files, because it knows that this is where hyperlinks are. It grabs the parent's title and directory location because it believes they must be interesting intel. Then it loops through the links.

For each link targeted to the current volume, it grabs the anchored text and the target directory if it exists. It'll include as potential keywords for the child the anchored text, the parent's title, the parent's folder and the child's folder.

If it's a PDF file, it'll use the very powerful PDF::API2 module to read and then write properties. It'll set the title, subject and it'll add the relevant keywords where needed.

If it's an HTML file, it'll use regexps to determine whether there is a title, a description and keywords, and then evaluate them. It'll only modify the markup related to this meta-data, injecting what it understood as relevant.

#!/usr/bin/perl
# All by HAbeTT
# Meta data processor
# 2005, Creative Commons, some rights reserved

use File::Find;
use PDF::API2;

# starting point(s) of the crawl; process() is called for every entry below
my @roots = ("/wwwroot");
find({ wanted => \&process }, @roots);

# all the work happens inside process()
exit 0;

# File::Find callback for MetaCast.  For every HTML page found, follow its
# local links and enrich each linked PDF / HTML file with meta data deduced
# from the link: anchored text, parent title, parent folder, child folder.
#
# Reads  : $_ (entry name inside the directory File::Find chdir()ed into)
#          and $File::Find::name (full path, used for messages and for the
#          parent folder name).
# Writes : the linked PDF and HTML files, in place.
# Dies   : only when an HTML target that could be read cannot be written back.
# Broken links are reported on STDERR and skipped.
#
# Relative links are opened as they are written: this works because
# File::Find has made the directory of the parent page the current directory.
sub process {
  # get filename
  my $file = $File::Find::name;
  my $name = $_;
  # eliminate directories
  return if (-d $name);
  # eliminate non html files
  return unless ($name =~ /\.html?$/i);
  # read file contents.
  my $code = _slurp($name);
  unless (defined $code) {
    warn "Can't read $file: $!\n";
    return;
  }
  print "$file\n";
  # treat CR LF: put the page on one line so that titles and anchors broken
  # across lines still match
  $code =~ s/\015\012|\012|\015/ /g;
  # grab parent directory name: last folder before the file name
  my ($parent) = ($file =~ m{([^/]+)/[^/]*$});
  # get title
  my ($titre) = ($code =~ /<title\b[^>]*>(.*?)<\/title>/i);
  $titre = _plain_text($titre);
  # get the links
  my @links = ($code =~ /<a\b[^>]*?\bhref\s*=\s*"([^"]*)"/gi);
  my %seen;
  #loop through links
  foreach my $candid (@links) {
    # a target linked several times is treated once, with its first anchor
    next if ($seen{$candid}++);
    # external links
    next if ($candid =~ /^http/);
    # other volumes
    next if ($candid =~ /:/);
    # get the anchor; the href is quoted, it is data and not a pattern
    my ($anchor) = ($code =~ /<a\b[^>]*?\bhref\s*=\s*"\Q$candid\E"[^>]*>(.*?)<\/a>/i);
    $anchor = _plain_text($anchor);
    # get child: last folder named in the link, if any
    my ($child) = ($candid =~ m{([^/]+)/[^/]*$});
    # prepare keywords: one seed per word, "." and ".." carry no meaning
    my $seed = join(' ', grep { defined $_ } ($anchor, $titre, $parent, $child));
    my @seeds = grep { $_ ne '.' && $_ ne '..' } split(' ', $seed);
    if ($candid =~ /\.pdf$/i) {
      # links to PDF files
      # init PDF::API2; a broken link must not stop the whole crawl
      my $pdf = eval { PDF::API2->open($candid) };
      unless ($pdf) {
        warn "Can't open $candid (linked from $file): $@\n";
        next;
      }
      print "=> $candid\n";
      # read metas
      my %oj = $pdf->info();
      # write metas; nothing is appended twice, so the script can be re-run
      $pdf->info('Title'=>_append_once($oj{'Title'}, $anchor),
                    'Subject'=>_append_once($oj{'Subject'}, $titre),
                    'Keywords'=>_merge_keywords($oj{'Keywords'}, @seeds));
      # save
      $pdf->saveas($candid);
    } elsif ($candid =~ /\.html?$/i) {
      # links to HTML files
      # read target; a missing target is skipped, never created
      my $html = _slurp($candid);
      unless (defined $html) {
        warn "Can't read $candid (linked from $file): $!\n";
        next;
      }
      print "=> $candid\n";
      my $before = $html;
      # parse meta data
      my ($title) = ($html =~ /<title\b[^>]*>(.*?)<\/title>/is);
      my $description = _meta_content($html, 'description');
      my $found = _meta_content($html, 'keywords');
      # add keywords
      my $keywords = _merge_keywords($found, @seeds);
      # unless set, title becomes the anchor text (an existing title is kept)
      unless (defined $title && $title =~ /\S/) {
        if (length $anchor) {
          unless ($html =~ s/<title\b[^>]*>.*?<\/title>/<title>$anchor<\/title>/is) {
            $html =~ s/<\/head/<title>$anchor<\/title>\n<\/head/i;
          }
        }
      }
      # unless set, description becomes parent title dash anchor
      unless (defined $description && $description =~ /\S/) {
        $description = join(' - ', grep { length } ($titre, $anchor));
      }
      # set metas: drop the old tag (and only that tag), add the new one
      $html =~ s/<meta\b(?=[^>]*\bname\s*=\s*["']?description\b)[^>]*>[ \t]*\n?//gi;
      if (length $description) {
        my $value = _attr($description);
        $html =~ s/<\/head/<meta name="description" content="$value">\n<\/head/i;
      }
      $html =~ s/<meta\b(?=[^>]*\bname\s*=\s*["']?keywords\b)[^>]*>[ \t]*\n?//gi;
      if (length $keywords) {
        my $value = _attr($keywords);
        $html =~ s/<\/head/<meta name="keywords" content="$value">\n<\/head/i;
      }
      # write target file, unless there is nothing new to write
      next if ($html eq $before);
      open (my $out, '>', $candid) or die ("Can't write $candid (linked from $file): $!\n");
      print $out $html;
      close ($out) or die ("Can't write $candid (linked from $file): $!\n");
    }
  }
}

# Return the whole content of $path, or undef (with $! set) when it cannot be
# opened.
sub _slurp {
  my ($path) = @_;
  open (my $in, '<', $path) or return undef;
  local $/;
  my $content = <$in>;
  close ($in);
  return defined $content ? $content : '';
}

# Reduce an HTML fragment to plain text: tags dropped, whitespace folded,
# ends trimmed.  undef gives ''.
sub _plain_text {
  my ($text) = @_;
  return '' unless (defined $text);
  $text =~ s/<[^>]*>/ /g;
  $text =~ s/\s+/ /g;
  $text =~ s/^ | $//g;
  return $text;
}

# Return the quoted content of <meta name="$name" ...> found in $html, or
# undef when there is no such tag or no quoted content.  The search never
# leaves the tag itself.
sub _meta_content {
  my ($html, $name) = @_;
  my ($tag) = ($html =~ /(<meta\b(?=[^>]*\bname\s*=\s*["']?\Q$name\E\b)[^>]*>)/i);
  return undef unless (defined $tag);
  my ($quote, $value) = ($tag =~ /\bcontent\s*=\s*(["'])(.*?)\1/is);
  return $value;
}

# Append to the keyword list $existing every seed it does not contain yet
# (case-insensitive substring test) and return the list, comma separated and
# without a leading comma.
sub _merge_keywords {
  my ($existing, @seeds) = @_;
  my $merged = defined $existing ? $existing : '';
  $merged =~ s/^\s+|\s+$//g;
  foreach my $word (@seeds) {
    next unless (defined $word && length $word);
    next if (index(lc $merged, lc $word) >= 0);
    $merged .= length($merged) ? ", $word" : $word;
  }
  return $merged;
}

# Return $current followed by $extra, unless $extra is empty or is already
# part of $current.
sub _append_once {
  my ($current, $extra) = @_;
  $current = '' unless (defined $current);
  return $current unless (defined $extra && length $extra);
  return $current if (index($current, $extra) >= 0);
  return length($current) ? "$current $extra" : $extra;
}

# Make $value safe inside a double-quoted HTML attribute.
sub _attr {
  my ($value) = @_;
  $value =~ s/"/&quot;/g;
  return $value;
}

main menu