mirror of https://github.com/perkeep/perkeep.git
basic mimetype indexing
This commit is contained in:
parent
a85141a5d2
commit
e0323ca8a3
|
@ -47,6 +47,7 @@ if ($needs_init) {
|
|||
$db->do("CREATE TABLE blobs (" .
|
||||
" blobref VARCHAR(80) NOT NULL PRIMARY KEY, " .
|
||||
" size INT NULL, " .
|
||||
" headbytes VARCHAR(1024) NULL, ".
|
||||
" mimetype VARCHAR(30) NULL)");
|
||||
}
|
||||
|
||||
|
@ -61,10 +62,11 @@ my $json = JSON::Any->new;
|
|||
$ua->credentials($hostport, "camlistored", "user", $netrc_mach->password);
|
||||
|
||||
print "Iterating over blobs.\n";
|
||||
my $n_blobs = learn_blob_digests_and_sizes();
|
||||
my $n_blobs = populate_blob_digests_and_sizes();
|
||||
print "Number of blobs: $n_blobs.\n";
|
||||
populate_blob_types();
|
||||
|
||||
sub learn_blob_digests_and_sizes {
|
||||
sub populate_blob_digests_and_sizes {
|
||||
my $after = "";
|
||||
my $n_blobs = 0;
|
||||
while (1) {
|
||||
|
@ -110,3 +112,58 @@ sub learn_blob_digests_and_sizes {
|
|||
}
|
||||
return $n_blobs;
|
||||
}
|
||||
|
||||
# Sniff and record a MIME type for every row of the blobs table that has a
# size but no mimetype yet.
#
# Rows are paged through in blobref order, 50 at a time, using the last
# blobref seen as the cursor. For a row with no stored headbytes the start
# of the blob is fetched over HTTP (dying on any unsuccessful response) and
# stored; the type is then guessed with get_type_from_magic(). A row is
# updated when a type was recognized or when headbytes were newly fetched.
#
# Takes no arguments and returns nothing. Uses the file-level $db, $ua,
# $scheme and $hostport.
sub populate_blob_types {
    # Number of leading bytes kept per blob; the headbytes column is
    # declared VARCHAR(1024).
    my $max_head_bytes = 1024;

    # HTTP byte ranges are inclusive on both ends, so the last wanted
    # offset is one less than the byte count.
    my $last_offset = $max_head_bytes - 1;

    # ORDER BY is required for the "blobref > ?" cursor to be a correct
    # pagination scheme; without it LIMIT may return an arbitrary subset and
    # advancing $after could skip rows.
    my $sth = $db->prepare("SELECT blobref, size, headbytes FROM blobs WHERE " .
                           "mimetype IS NULL AND size IS NOT NULL AND blobref > ? " .
                           "ORDER BY blobref LIMIT 50");

    my $after = "";
    while (1) {
        print "Querying for un-sniffed blobs after '$after'...\n";
        $sth->execute($after);

        # Drain the page before issuing any UPDATE so that no read cursor
        # is open on the table while it is being written.
        my $rows = $sth->fetchall_arrayref;
        my $cursor_count = scalar @$rows;

        for my $row (@$rows) {
            my ($blobref, $size, $headbytes) = @$row;
            $after = $blobref;
            my $need_headbytes_update = 0;

            # An empty blob has nothing to fetch; treat its head as "".
            $headbytes = "" if defined($size) && $size == 0;

            if (defined $headbytes) {
                # Head bytes were stored by an earlier run that could not
                # identify them; retry in case new signatures are known.
                print "Unknown type for $blobref ...\n";
            } else {
                print "Fetching $blobref ...\n";
                my $req = HTTP::Request->new(GET => "$scheme://$hostport/camli/$blobref");
                $req->header("Range" => "bytes=0-$last_offset");
                my $res = $ua->request($req);
                unless ($res->is_success) {
                    die "Failure fetching head of /camli/$blobref: " . $res->status_line;
                }
                $headbytes = $res->content;

                # A server may ignore the Range header and send the whole
                # blob; never keep more than the column can hold.
                if (length($headbytes) > $max_head_bytes) {
                    $headbytes = substr($headbytes, 0, $max_head_bytes);
                }
                $need_headbytes_update = 1;
                my $head_len = length($headbytes);
                print "Fetching $blobref = $head_len byte header\n";
            }

            my $type = get_type_from_magic($headbytes);
            next unless $type or $need_headbytes_update;

            # $type is undef when only the head bytes are being saved.
            my $type_label = defined $type ? $type : "(unknown)";
            print "Type of $blobref: $type_label\n";

            # An undef $type is bound as NULL, leaving the row eligible for
            # a later sniffing pass.
            $db->do("UPDATE blobs SET headbytes=?, mimetype=? WHERE blobref=?", undef,
                    $headbytes, $type, $blobref);
        }

        print "count: $cursor_count\n";
        last unless $cursor_count > 0;
    }
    return;
}
|
||||
|
||||
# Guess a MIME type from the leading bytes of a blob.
#
# Takes one argument, the blob's head bytes as a string. Returns
# "application/json+camli", "image/jpeg" or "application/gpx+xml" when the
# content is recognized, and undef otherwise (including for undef or empty
# input). Always returns a single scalar so the result can be bound
# directly as a SQL parameter.
sub get_type_from_magic {
    my $magic = shift;
    return undef unless defined $magic;

    # A JSON object mentioning "camliVersion". The brace is escaped because
    # newer perls warn about or reject an unescaped literal "{" in a
    # pattern, and ".*" lets the key follow the brace immediately.
    if ($magic =~ /^\{.*"camliVersion"/s) {
        return "application/json+camli";
    }

    # JPEG: the SOI marker (FF D8) followed by the FF that starts the first
    # segment marker. The byte after that varies by flavor (E0 for JFIF,
    # E1 for Exif, ...), so it is not part of the signature.
    if ($magic =~ /^\xff\xd8\xff/) {
        return "image/jpeg";
    }

    if ($magic =~ /^<\?xml\b.+<gpx\b/s) {  # TODO: over-broad
        return "application/gpx+xml";  # not actually registered?
    }

    return undef;
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue