perkeep/indexing/basic/index.pl

108 lines
3.3 KiB
Perl
Executable File

#!/usr/bin/perl
#
# This is a basic indexing system for prototyping and demonstration
# purposes only.
#
# It's not production quality by any stretch of the imagination: it
# uses sqlite, is single-threaded, single-process, etc.
#
use strict;
use DBI;
use DBD::SQLite;
use Getopt::Long;
use LWP::UserAgent;
use LWP::ConnCache;
use JSON::Any;
use Net::Netrc;
my $blobserver;
my $dbname;
GetOptions(
"server=s" => \$blobserver,
"dbfile=s" => \$dbname,
) or usage();
usage() unless $blobserver && $blobserver =~ m!^(?:(https?)://)?([^/]+)!;
my $scheme = $1 || "http";
my $hostport = $2;
sub usage {
die
"Usage: index.pl --server=[http|https://]hostname[:port]\n" .
" --dbfile=<filename> (defaults to hostname_port.camlidx.db)\n";
}
my $netrc_mach = Net::Netrc->lookup($hostport)
or die "No ~/.netrc file or section for host '$hostport'.\n";
unless ($dbname) {
$dbname = "$hostport.camlidx.db";
$dbname =~ s/:/_/;
}
my $needs_init = ! -f $dbname || -s $dbname == 0;
my $db = DBI->connect("dbi:SQLite:dbname=$dbname", "", "", { RaiseError => 1 })
or die "Failed to open sqlite file $dbname";
if ($needs_init) {
$db->do("CREATE TABLE blobs (" .
" blobref VARCHAR(80) NOT NULL PRIMARY KEY, " .
" size INT NULL, " .
" mimetype VARCHAR(30) NULL)");
}
my $ua = LWP::UserAgent->new(
agent => "Camlistore/BasicIndexer",
conn_cache => LWP::ConnCache->new,
);
my $json = JSON::Any->new;
# TODO: remove hard-coded realm
$ua->credentials($hostport, "camlistored", "user", $netrc_mach->password);
my $after = "";
my $n_blobs = 0;
while (1) {
print "Enumerating starting at: $after ... ($n_blobs blobs so far)\n";
my $res = $ua->get("$scheme://$hostport/camli/enumerate-blobs?after=$after&limit=5000");
unless ($res->is_success) {
die "Failure from /camli/enumerate-blobs?after=$after: " . $res->status_line;
}
my $jres = $json->jsonToObj($res->content);
my $bloblist = $jres->{'blobs'};
if (ref($bloblist) eq "ARRAY") {
my $first = $bloblist->[0]{'blobRef'};
my $last = $bloblist->[-1]{'blobRef'};
my $sth = $db->prepare("SELECT blobref, size, mimetype FROM blobs WHERE " .
"blobref >= ? AND blobref <= ?");
$sth->execute($first, $last);
my %inventory; # blobref -> [$size, $bool_have_mime]
while (my ($lblob, $lsize, $lmimetype) = $sth->fetchrow_array) {
$inventory{$lblob} = [$lsize, defined($lmimetype)];
}
foreach my $blob (@$bloblist) {
$n_blobs++;
my $lblob = $inventory{$blob->{'blobRef'}};
if (!$lblob) {
print "Inserting $blob->{'blobRef'} ...\n";
$db->do("INSERT INTO blobs (blobref, size) VALUES (?, ?)", undef,
$blob->{'blobRef'}, $blob->{'size'});
next;
}
if ($lblob && !$lblob->[0] && $blob->{'size'}) {
print "Updating size of $blob->{'blobRef'} ...\n";
$db->do("UPDATE blobs SET size=? WHERE blobref=?", undef,
$blob->{'size'}, $blob->{'blobRef'});
next;
}
}
}
last unless $jres->{'after'} && $jres->{'after'} gt $after;
$after = $jres->{'after'};
}
print "$n_blobs blobs indexed.\n";