basic mimetype indexing

This commit is contained in:
Brad Fitzpatrick 2010-10-11 21:26:09 -07:00
parent a85141a5d2
commit e0323ca8a3
1 changed file with 59 additions and 2 deletions

View File

@ -47,6 +47,7 @@ if ($needs_init) {
$db->do("CREATE TABLE blobs (" .
" blobref VARCHAR(80) NOT NULL PRIMARY KEY, " .
" size INT NULL, " .
" headbytes VARCHAR(1024) NULL, ".
" mimetype VARCHAR(30) NULL)");
}
@ -61,10 +62,11 @@ my $json = JSON::Any->new;
$ua->credentials($hostport, "camlistored", "user", $netrc_mach->password);
print "Iterating over blobs.\n";
my $n_blobs = learn_blob_digests_and_sizes();
my $n_blobs = populate_blob_digests_and_sizes();
print "Number of blobs: $n_blobs.\n";
populate_blob_types();
sub learn_blob_digests_and_sizes {
sub populate_blob_digests_and_sizes {
my $after = "";
my $n_blobs = 0;
while (1) {
@ -110,3 +112,58 @@ sub learn_blob_digests_and_sizes {
}
return $n_blobs;
}
# Sniff and record a MIME type for every blob that does not have one yet.
#
# Pages through the blobs table (rows with a known size but a NULL
# mimetype) in batches of 50, keyed on blobref.  For each row the leading
# bytes of the blob come from the cached headbytes column or, when that is
# NULL, from an HTTP Range request against the blob server.  The bytes are
# passed to get_type_from_magic() and the row is updated whenever a type
# was recognized or fresh head bytes were fetched.
#
# Takes no arguments and returns nothing useful.  Dies if fetching the head
# of a blob fails.  Relies on the file-level $db, $ua, $scheme and $hostport.
sub populate_blob_types {
    # Must not exceed the width of the headbytes column (VARCHAR(1024)).
    my $max_head_bytes = 1024;
    # HTTP byte ranges are inclusive on both ends, so the last index is N-1.
    my $last_byte = $max_head_bytes - 1;

    # The statement is loop-invariant, so prepare it once.  ORDER BY makes
    # the "blobref > ?" keyset pagination well-defined: without it the rows
    # of a batch may arrive in any order and $after could skip blobs.
    my $sth = $db->prepare("SELECT blobref, size, headbytes FROM blobs WHERE " .
                           "mimetype IS NULL AND size IS NOT NULL AND blobref > ? " .
                           "ORDER BY blobref LIMIT 50");

    my $after = "";
    while (1) {
        print "Querying for un-sniffed blobs after '$after'...\n";
        $sth->execute($after);
        my $cursor_count = 0;
        while (my ($blobref, $size, $headbytes) = $sth->fetchrow_array) {
            $after = $blobref;
            $cursor_count++;
            my $need_headbytes_update = 0;

            # An empty blob has no head to fetch; treat it as already cached.
            $headbytes = "" if defined($size) && $size == 0;

            if (defined $headbytes) {
                # Head bytes were cached by an earlier run that could not
                # classify them; retry in case new signatures were added.
                print "Unknown type for $blobref ...\n";
            } else {
                print "Fetching $blobref ...\n";
                my $req = HTTP::Request->new(GET => "$scheme://$hostport/camli/$blobref");
                $req->header("Range" => "bytes=0-$last_byte");
                my $res = $ua->request($req);
                unless ($res->is_success) {
                    die "Failure fetching head of /camli/$blobref: " . $res->status_line;
                }
                # A server may ignore Range and send the whole blob; never
                # keep more than the column can hold.
                $headbytes = substr($res->content, 0, $max_head_bytes);
                $need_headbytes_update = 1;
                my $head_len = length($headbytes);
                print "Fetching $blobref = $head_len byte header\n";
            }

            my $type = get_type_from_magic($headbytes);
            # Nothing new to store: no type found and head bytes already cached.
            next unless $type or $need_headbytes_update;

            # $type is undef when only the head bytes are being cached.
            my $shown_type = defined $type ? $type : "(unknown)";
            print "Type of $blobref: $shown_type\n";
            $db->do("UPDATE blobs SET headbytes=?, mimetype=? WHERE blobref=?", undef,
                    $headbytes, $type, $blobref);
        }
        print "count: $cursor_count\n";
        last unless $cursor_count > 0;
    }
    return;
}
# Guess a MIME type from the leading bytes of a blob.
#
# Argument:
#   $magic - the first bytes of the blob's content.
# Returns a MIME type string, or undef when $magic is undefined or matches
# none of the known signatures.
sub get_type_from_magic {
    my $magic = shift;
    return undef unless defined $magic;

    # A JSON object that mentions "camliVersion".  Uses .* (not .+) so the
    # key may directly follow the opening brace, as in {"camliVersion": 1}.
    # The brace is escaped so it is always read as a literal character.
    if ($magic =~ /^\{.*"camliVersion"/s) {
        return "application/json+camli";
    }
    # JPEG: the SOI marker (FF D8) followed by the FF that begins the next
    # marker.  Covers JFIF (E0) and Exif (E1) files alike.
    if ($magic =~ /^\xff\xd8\xff/) {
        return "image/jpeg";
    }
    if ($magic =~ /^<\?xml\b.+<gpx\b/s) { # TODO: over-broad
        return "application/gpx+xml"; # not actually registered?
    }
    # Explicit undef rather than a bare return, so a caller using list
    # context still receives exactly one element.
    return undef;
}