# Sniff and record the MIME type of every blob whose size is known but
# whose mimetype column is still NULL.
#
# Uses the file-level $db (DBI handle), $ua (HTTP user agent), $scheme and
# $hostport.  Takes no arguments; its return value is not used by the
# visible caller.
#
# For each candidate row:
#   * a zero-size blob is treated as having empty head bytes (no fetch);
#   * if headbytes is already stored, the blob was fetched on an earlier
#     run and is only re-classified;
#   * otherwise the first bytes of the blob are fetched over HTTP with a
#     Range request, and are saved even when no type is recognised so
#     the blob is not fetched again.
#
# Dies if an HTTP fetch does not succeed.
sub populate_blob_types {
    # Must agree with the headbytes VARCHAR(1024) column in CREATE TABLE.
    my $max_headbytes = 1024;
    # HTTP byte ranges are inclusive, so the last offset is length - 1.
    my $last_offset = $max_headbytes - 1;

    # Prepared once and re-executed per page.  ORDER BY is required for
    # the "blobref > ?" cursor to be correct: without it the 50 rows
    # returned are in unspecified order and advancing $after could skip
    # blobs.
    my $sth = $db->prepare("SELECT blobref, size, headbytes FROM blobs WHERE " .
                           "mimetype IS NULL AND size IS NOT NULL AND blobref > ? " .
                           "ORDER BY blobref LIMIT 50");

    my $after = "";
    while (1) {
        print "Querying for un-sniffed blobs after '$after'...\n";
        $sth->execute($after);
        my $cursor_count = 0;
        while (my ($blobref, $size, $headbytes) = $sth->fetchrow_array) {
            $after = $blobref;
            $cursor_count++;
            my $need_headbytes_update = 0;

            # Nothing to fetch for an empty blob.
            $headbytes = "" if defined($size) && $size == 0;

            if (defined $headbytes) {
                # Head bytes already known; type was not recognised before.
                print "Unknown type for $blobref ...\n";
            } else {
                print "Fetching $blobref ...\n";
                my $req = HTTP::Request->new(GET => "$scheme://$hostport/camli/$blobref");
                $req->header("Range" => "bytes=0-$last_offset");
                my $res = $ua->request($req);
                unless ($res->is_success) {
                    die "Failure fetching head of /camli/$blobref: " . $res->status_line;
                }
                $headbytes = $res->content;
                # A server may ignore Range and send the whole blob; never
                # keep more than the column can hold.
                if (length($headbytes) > $max_headbytes) {
                    $headbytes = substr($headbytes, 0, $max_headbytes);
                }
                $need_headbytes_update = 1;
                my $head_len = length($headbytes);
                print "Fetching $blobref = $head_len byte header\n";
            }

            # NOTE(review): get_type_from_magic appears to return a MIME
            # type string on a match; its no-match return value is not
            # visible here and is assumed to be false.
            my $type = get_type_from_magic($headbytes);
            next unless $type or $need_headbytes_update;

            # $type may be false here when only headbytes needs saving.
            print "Type of $blobref: " . ($type ? $type : "(unknown)") . "\n";
            $db->do("UPDATE blobs SET headbytes=?, mimetype=? WHERE blobref=?", undef,
                    $headbytes, $type, $blobref);
        }
        print "count: $cursor_count\n";
        last unless $cursor_count > 0;
    }
}