2014-03-15 01:14:18 +00:00
|
|
|
/*
|
|
|
|
Copyright 2014 The Camlistore Authors
|
|
|
|
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
you may not use this file except in compliance with the License.
|
|
|
|
You may obtain a copy of the License at
|
|
|
|
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
See the License for the specific language governing permissions and
|
|
|
|
limitations under the License.
|
|
|
|
*/
|
|
|
|
|
|
|
|
// Package twitter implements a twitter.com importer.
|
|
|
|
package twitter
|
|
|
|
|
|
|
|
import (
|
2014-06-13 23:35:16 +00:00
|
|
|
"archive/zip"
|
|
|
|
"bytes"
|
|
|
|
"encoding/json"
|
2014-03-15 01:14:18 +00:00
|
|
|
"errors"
|
|
|
|
"fmt"
|
2014-06-13 23:35:16 +00:00
|
|
|
"io/ioutil"
|
2014-03-15 01:14:18 +00:00
|
|
|
"log"
|
|
|
|
"net/http"
|
|
|
|
"net/url"
|
2014-06-17 01:32:05 +00:00
|
|
|
"os"
|
|
|
|
"path"
|
2014-06-19 23:23:10 +00:00
|
|
|
"regexp"
|
2014-03-15 01:14:18 +00:00
|
|
|
"strconv"
|
|
|
|
"strings"
|
2014-06-13 23:35:16 +00:00
|
|
|
"sync"
|
2014-03-15 01:14:18 +00:00
|
|
|
"time"
|
|
|
|
|
2014-06-13 23:35:16 +00:00
|
|
|
"camlistore.org/pkg/blob"
|
2014-03-21 19:27:06 +00:00
|
|
|
"camlistore.org/pkg/context"
|
2014-03-15 01:14:18 +00:00
|
|
|
"camlistore.org/pkg/httputil"
|
|
|
|
"camlistore.org/pkg/importer"
|
|
|
|
"camlistore.org/pkg/schema"
|
2014-06-13 23:35:16 +00:00
|
|
|
"camlistore.org/pkg/syncutil"
|
2014-03-15 01:14:18 +00:00
|
|
|
"camlistore.org/third_party/github.com/garyburd/go-oauth/oauth"
|
|
|
|
)
|
|
|
|
|
|
|
|
const (
	// Twitter REST API v1.1 endpoints and OAuth 1.0a URLs.
	apiURL                        = "https://api.twitter.com/1.1/"
	temporaryCredentialRequestURL = "https://api.twitter.com/oauth/request_token"
	resourceOwnerAuthorizationURL = "https://api.twitter.com/oauth/authorize"
	tokenRequestURL               = "https://api.twitter.com/oauth/access_token"
	userInfoAPIPath               = "account/verify_credentials.json"

	// runCompleteVersion is a cache-busting version number of the
	// importer code. It should be incremented whenever the
	// behavior of this importer is updated enough to warrant a
	// complete run. Otherwise, if the importer runs to
	// completion, this version number is recorded on the account
	// permanode and subsequent importers can stop early.
	runCompleteVersion = "4"

	// Account-permanode attribute names holding OAuth state.
	// TODO(mpl): refactor these 4 below into an oauth package when doing flickr.
	acctAttrTempToken         = "oauthTempToken"
	acctAttrTempSecret        = "oauthTempSecret"
	acctAttrAccessToken       = "oauthAccessToken"
	acctAttrAccessTokenSecret = "oauthAccessTokenSecret"

	// acctAttrTweetZip specifies an optional attribute for the account permanode.
	// If set, it should be of a "file" schema blob referencing the tweets.zip
	// file that Twitter makes available for the full archive download.
	// The Twitter API doesn't go back forever in time, so if you started using
	// the Camlistore importer too late, you need to "camput file tweets.zip"
	// once downloading it from Twitter, and then:
	//   $ camput attr <acct-permanode> twitterArchiveZipFileRef <zip-fileref>
	// ... and re-do an import.
	acctAttrTweetZip = "twitterArchiveZipFileRef"

	// acctAttrZipDoneVersion is updated at the end of a successful zip import and
	// is used to determine whether the zip file needs to be re-imported in a future run.
	acctAttrZipDoneVersion = "twitterZipDoneVersion" // == "<fileref>:<runCompleteVersion>"

	// Per-tweet note of how we imported it: either "zip" or "api".
	attrImportMethod = "twitterImportMethod"

	tweetRequestLimit = 200 // max number of tweets we can get in a user_timeline request
	tweetsAtOnce      = 20  // how many tweets to import at once
)
|
|
|
|
|
2014-04-18 22:21:59 +00:00
|
|
|
// init registers this package as the "twitter" importer with the
// importer framework.
func init() {
	importer.Register("twitter", &imp{})
}
|
|
|
|
|
|
|
|
// Compile-time check that imp provides custom account-setup HTML.
var _ importer.ImporterSetupHTMLer = (*imp)(nil)

// imp is the Twitter importer implementation. It is stateless; per-run
// state lives in the run type.
type imp struct {
	importer.OAuth1 // for CallbackRequestAccount and CallbackURLParameters
}
|
2014-03-15 01:14:18 +00:00
|
|
|
|
2014-04-18 22:21:59 +00:00
|
|
|
// NeedsAPIKey reports that Twitter requires an application API key/secret pair.
func (im *imp) NeedsAPIKey() bool { return true }
|
|
|
|
|
|
|
|
func (im *imp) IsAccountReady(acctNode *importer.Object) (ok bool, err error) {
|
2014-06-13 23:35:16 +00:00
|
|
|
if acctNode.Attr(importer.AcctAttrUserID) != "" && acctNode.Attr(acctAttrAccessToken) != "" {
|
2014-04-18 22:21:59 +00:00
|
|
|
return true, nil
|
|
|
|
}
|
|
|
|
return false, nil
|
2014-03-15 01:14:18 +00:00
|
|
|
}
|
|
|
|
|
2014-04-18 22:21:59 +00:00
|
|
|
func (im *imp) SummarizeAccount(acct *importer.Object) string {
|
|
|
|
ok, err := im.IsAccountReady(acct)
|
|
|
|
if err != nil {
|
|
|
|
return "Not configured; error = " + err.Error()
|
2014-03-15 01:14:18 +00:00
|
|
|
}
|
2014-04-18 22:21:59 +00:00
|
|
|
if !ok {
|
|
|
|
return "Not configured"
|
2014-03-15 01:14:18 +00:00
|
|
|
}
|
2014-06-13 23:35:16 +00:00
|
|
|
s := fmt.Sprintf("@%s (%s), twitter id %s",
|
|
|
|
acct.Attr(importer.AcctAttrUserName),
|
|
|
|
acct.Attr(importer.AcctAttrName),
|
|
|
|
acct.Attr(importer.AcctAttrUserID),
|
|
|
|
)
|
|
|
|
if acct.Attr(acctAttrTweetZip) != "" {
|
|
|
|
s += " + zip file"
|
2014-03-15 01:14:18 +00:00
|
|
|
}
|
2014-06-13 23:35:16 +00:00
|
|
|
return s
|
2014-03-15 01:14:18 +00:00
|
|
|
}
|
|
|
|
|
2014-04-18 22:21:59 +00:00
|
|
|
func (im *imp) AccountSetupHTML(host *importer.Host) string {
|
|
|
|
base := host.ImporterBaseURL() + "twitter"
|
|
|
|
return fmt.Sprintf(`
|
|
|
|
<h1>Configuring Twitter</h1>
|
|
|
|
<p>Visit <a href='https://apps.twitter.com/'>https://apps.twitter.com/</a> and click "Create New App".</p>
|
|
|
|
<p>Use the following settings:</p>
|
|
|
|
<ul>
|
|
|
|
<li>Name: Does not matter. (camlistore-importer).</li>
|
|
|
|
<li>Description: Does not matter. (imports twitter data into camlistore).</li>
|
|
|
|
<li>Website: <b>%s</b></li>
|
|
|
|
<li>Callback URL: <b>%s</b></li>
|
|
|
|
</ul>
|
|
|
|
<p>Click "Create your Twitter application".You should be redirected to the Application Management page of your newly created application.
|
|
|
|
</br>Go to the API Keys tab. Copy the "API key" and "API secret" into the "Client ID" and "Client Secret" boxes above.</p>
|
|
|
|
`, base, base+"/callback")
|
2014-03-15 01:14:18 +00:00
|
|
|
}
|
|
|
|
|
2014-04-18 22:21:59 +00:00
|
|
|
// A run is our state for a given run of the importer.
type run struct {
	*importer.RunContext
	im          *imp
	incremental bool // whether we've completed a run in the past

	oauthClient *oauth.Client      // No need to guard, used read-only.
	accessCreds *oauth.Credentials // No need to guard, used read-only.

	mu     sync.Mutex // guards anyErr
	anyErr bool       // set by errorf; prevents recording run completion
}
|
|
|
|
|
2014-05-14 21:34:43 +00:00
|
|
|
// oauthContext bundles the run's context with its OAuth client and access
// credentials, for making signed API calls via oauthContext.doAPI.
func (r *run) oauthContext() oauthContext {
	return oauthContext{r.Context, r.oauthClient, r.accessCreds}
}
|
|
|
|
|
2014-06-17 01:32:05 +00:00
|
|
|
// forceFullImport, when the CAMLI_TWITTER_FULL_IMPORT environment variable
// is set to a true value, disables the incremental-run shortcut.
var forceFullImport, _ = strconv.ParseBool(os.Getenv("CAMLI_TWITTER_FULL_IMPORT"))
|
|
|
|
|
2014-04-18 22:21:59 +00:00
|
|
|
// Run performs one import run for the given account: it imports tweets via
// the REST API (unless CAMLI_TWITTER_SKIP_API_IMPORT is set), then, if the
// account permanode references an archive zip (acctAttrTweetZip) that has
// not already been imported at this code version, imports tweets from that
// zip. On a fully clean run it records runCompleteVersion on the account
// permanode so future runs can go incremental.
func (im *imp) Run(ctx *importer.RunContext) error {
	clientId, secret, err := ctx.Credentials()
	if err != nil {
		return fmt.Errorf("no API credentials: %v", err)
	}
	acctNode := ctx.AccountNode()
	accessToken := acctNode.Attr(acctAttrAccessToken)
	accessSecret := acctNode.Attr(acctAttrAccessTokenSecret)
	if accessToken == "" || accessSecret == "" {
		return errors.New("access credentials not found")
	}
	r := &run{
		RunContext: ctx,
		im:         im,
		// Incremental only if a prior run completed with the same importer version.
		incremental: !forceFullImport && acctNode.Attr(importer.AcctAttrCompletedVersion) == runCompleteVersion,

		oauthClient: &oauth.Client{
			TemporaryCredentialRequestURI: temporaryCredentialRequestURL,
			ResourceOwnerAuthorizationURI: resourceOwnerAuthorizationURL,
			TokenRequestURI:               tokenRequestURL,
			Credentials: oauth.Credentials{
				Token:  clientId,
				Secret: secret,
			},
		},
		accessCreds: &oauth.Credentials{
			Token:  accessToken,
			Secret: accessSecret,
		},
	}

	userID := acctNode.Attr(importer.AcctAttrUserID)
	if userID == "" {
		return errors.New("UserID hasn't been set by account setup.")
	}

	skipAPITweets, _ := strconv.ParseBool(os.Getenv("CAMLI_TWITTER_SKIP_API_IMPORT"))
	if !skipAPITweets {
		if err := r.importTweets(userID); err != nil {
			return err
		}
	}

	// zipDoneVal encodes both the zip fileref and the importer version, so
	// changing either forces a zip re-import.
	zipRef := acctNode.Attr(acctAttrTweetZip)
	zipDoneVal := zipRef + ":" + runCompleteVersion
	if zipRef != "" && !(r.incremental && acctNode.Attr(acctAttrZipDoneVersion) == zipDoneVal) {
		zipbr, ok := blob.Parse(zipRef)
		if !ok {
			return fmt.Errorf("invalid zip file blobref %q", zipRef)
		}
		fr, err := schema.NewFileReader(r.Host.BlobSource(), zipbr)
		if err != nil {
			return fmt.Errorf("error opening zip %v: %v", zipbr, err)
		}
		defer fr.Close()
		zr, err := zip.NewReader(fr, fr.Size())
		if err != nil {
			return fmt.Errorf("Error opening twitter zip file %v: %v", zipRef, err)
		}
		if err := r.importTweetsFromZip(userID, zr); err != nil {
			return err
		}
		if err := acctNode.SetAttrs(acctAttrZipDoneVersion, zipDoneVal); err != nil {
			return err
		}
	}

	r.mu.Lock()
	anyErr := r.anyErr
	r.mu.Unlock()

	// Only mark the run complete if no per-tweet error was recorded, so a
	// future run retries the failures rather than stopping early.
	if !anyErr {
		if err := acctNode.SetAttrs(importer.AcctAttrCompletedVersion, runCompleteVersion); err != nil {
			return err
		}
	}

	return nil
}
|
|
|
|
|
2014-06-13 23:35:16 +00:00
|
|
|
func (r *run) errorf(format string, args ...interface{}) {
|
|
|
|
log.Printf(format, args...)
|
|
|
|
r.mu.Lock()
|
|
|
|
defer r.mu.Unlock()
|
|
|
|
r.anyErr = true
|
|
|
|
}
|
|
|
|
|
2014-04-18 22:21:59 +00:00
|
|
|
// importTweets pages backwards through the user's timeline via the REST
// API, importing up to tweetRequestLimit tweets per request and up to
// tweetsAtOnce concurrently. It stops when a page yields nothing new, or,
// on incremental runs, as soon as an entire page consists of
// already-imported tweets.
func (r *run) importTweets(userID string) error {
	maxId := ""
	continueRequests := true

	tweetsNode, err := r.getTopLevelNode("tweets", "Tweets")
	if err != nil {
		return err
	}

	numTweets := 0
	sawTweet := map[string]bool{}

	for continueRequests {
		if r.Context.IsCanceled() {
			r.errorf("Twitter importer: interrupted")
			return context.ErrCanceled
		}

		var resp []*apiTweetItem
		log.Printf("Fetching tweets for userid %s with max ID %q", userID, maxId)
		if err := r.oauthContext().doAPI(&resp, "statuses/user_timeline.json",
			"user_id", userID,
			"count", strconv.Itoa(tweetRequestLimit),
			"max_id", maxId); err != nil {
			return err
		}

		var (
			newThisBatch = 0
			allDupMu     sync.Mutex
			allDups      = true // whether every import this batch was a no-op
			gate         = syncutil.NewGate(tweetsAtOnce)
			grp          syncutil.Group
		)
		for i := range resp {
			tweet := resp[i]

			// Dup-suppression: successive pages can overlap (max_id
			// pagination re-returns the boundary tweet), so skip IDs
			// already seen this run.
			if sawTweet[tweet.Id] {
				continue
			}
			sawTweet[tweet.Id] = true
			newThisBatch++
			maxId = tweet.Id

			gate.Start()
			grp.Go(func() error {
				defer gate.Done()
				dup, err := r.importTweet(tweetsNode, tweet, true)
				if !dup {
					allDupMu.Lock()
					allDups = false
					allDupMu.Unlock()
				}
				if err != nil {
					r.errorf("Twitter importer: error importing tweet %s %v", tweet.Id, err)
				}
				return err
			})
		}
		if err := grp.Err(); err != nil {
			return err
		}
		numTweets += newThisBatch
		log.Printf("Imported %d tweets this batch; %d total.", newThisBatch, numTweets)
		if r.incremental && allDups {
			log.Printf("twitter incremental import found end batch")
			break
		}
		continueRequests = newThisBatch > 0
	}
	log.Printf("Successfully did full run of importing %d tweets", numTweets)
	return nil
}
|
|
|
|
|
2014-06-17 01:32:05 +00:00
|
|
|
func tweetsFromZipFile(zf *zip.File) (tweets []*zipTweetItem, err error) {
|
2014-06-13 23:35:16 +00:00
|
|
|
rc, err := zf.Open()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
slurp, err := ioutil.ReadAll(rc)
|
|
|
|
rc.Close()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
i := bytes.IndexByte(slurp, '[')
|
|
|
|
if i < 0 {
|
|
|
|
return nil, errors.New("No '[' found in zip file")
|
|
|
|
}
|
|
|
|
slurp = slurp[i:]
|
|
|
|
if err := json.Unmarshal(slurp, &tweets); err != nil {
|
|
|
|
return nil, fmt.Errorf("JSON error: %v", err)
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// importTweetsFromZip imports all tweets found in a Twitter archive zip,
// reading each data/js/tweets/2*.js file and importing its tweets with up
// to tweetsAtOnce concurrent workers. The userID parameter is currently
// unused here; tweets are attached under the account's "tweets" node.
func (r *run) importTweetsFromZip(userID string, zr *zip.Reader) error {
	log.Printf("Processing zip file with %d files", len(zr.File))

	tweetsNode, err := r.getTopLevelNode("tweets", "Tweets")
	if err != nil {
		return err
	}

	var (
		gate = syncutil.NewGate(tweetsAtOnce)
		grp  syncutil.Group
	)
	total := 0
	for _, zf := range zr.File {
		// Only the per-month tweet files; skip index/media/etc.
		if !(strings.HasPrefix(zf.Name, "data/js/tweets/2") && strings.HasSuffix(zf.Name, ".js")) {
			continue
		}
		tweets, err := tweetsFromZipFile(zf)
		if err != nil {
			return fmt.Errorf("error reading tweets from %s: %v", zf.Name, err)
		}

		for i := range tweets {
			total++
			tweet := tweets[i]
			gate.Start()
			grp.Go(func() error {
				defer gate.Done()
				_, err := r.importTweet(tweetsNode, tweet, false)
				return err
			})
		}
	}
	err = grp.Err()
	log.Printf("zip import of tweets: %d total, err = %v", total, err)
	return err
}
|
2014-03-15 01:14:18 +00:00
|
|
|
|
2014-06-13 23:35:16 +00:00
|
|
|
func timeParseFirstFormat(timeStr string, format ...string) (t time.Time, err error) {
|
|
|
|
if len(format) == 0 {
|
|
|
|
panic("need more than 1 format")
|
|
|
|
}
|
|
|
|
for _, f := range format {
|
|
|
|
t, err = time.Parse(f, timeStr)
|
|
|
|
if err == nil {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return
|
2014-03-15 01:14:18 +00:00
|
|
|
}
|
|
|
|
|
2014-06-16 22:09:22 +00:00
|
|
|
// viaAPI is true if it came via the REST API, or false if it came via a zip file.
|
2014-06-17 01:32:05 +00:00
|
|
|
func (r *run) importTweet(parent *importer.Object, tweet tweetItem, viaAPI bool) (dup bool, err error) {
|
2014-06-13 23:35:16 +00:00
|
|
|
if r.Context.IsCanceled() {
|
|
|
|
r.errorf("Twitter importer: interrupted")
|
|
|
|
return false, context.ErrCanceled
|
|
|
|
}
|
2014-06-17 01:32:05 +00:00
|
|
|
id := tweet.ID()
|
|
|
|
tweetNode, err := parent.ChildPathObject(id)
|
2014-03-15 01:14:18 +00:00
|
|
|
if err != nil {
|
2014-06-13 23:35:16 +00:00
|
|
|
return false, err
|
2014-03-15 01:14:18 +00:00
|
|
|
}
|
2014-06-19 23:23:10 +00:00
|
|
|
|
|
|
|
// Because the zip format and the API format differ a bit, and
|
|
|
|
// might diverge more in the future, never use the zip content
|
|
|
|
// to overwrite data fetched via the API. If we add new
|
|
|
|
// support for different fields in the future, we might want
|
|
|
|
// to revisit this decision. Be wary of flip/flopping data if
|
|
|
|
// modifying this, though.
|
2014-06-16 22:09:22 +00:00
|
|
|
if tweetNode.Attr(attrImportMethod) == "api" && !viaAPI {
|
|
|
|
return true, nil
|
|
|
|
}
|
2014-03-15 01:14:18 +00:00
|
|
|
|
2014-06-13 23:35:16 +00:00
|
|
|
// e.g. "2014-06-12 19:11:51 +0000"
|
2014-06-17 01:32:05 +00:00
|
|
|
createdTime, err := timeParseFirstFormat(tweet.CreatedAt(), time.RubyDate, "2006-01-02 15:04:05 -0700")
|
2014-03-15 01:14:18 +00:00
|
|
|
if err != nil {
|
2014-06-17 01:32:05 +00:00
|
|
|
return false, fmt.Errorf("could not parse time %q: %v", tweet.CreatedAt(), err)
|
2014-03-15 01:14:18 +00:00
|
|
|
}
|
|
|
|
|
2014-06-13 23:35:16 +00:00
|
|
|
url := fmt.Sprintf("https://twitter.com/%s/status/%v",
|
|
|
|
r.AccountNode().Attr(importer.AcctAttrUserName),
|
2014-06-17 01:32:05 +00:00
|
|
|
id)
|
|
|
|
|
2014-06-14 21:37:41 +00:00
|
|
|
attrs := []string{
|
2014-06-17 01:32:05 +00:00
|
|
|
"twitterId", id,
|
2014-03-15 01:14:18 +00:00
|
|
|
"camliNodeType", "twitter.com:tweet",
|
2014-06-13 23:35:16 +00:00
|
|
|
importer.AttrStartDate, schema.RFC3339FromTime(createdTime),
|
2014-06-17 01:32:05 +00:00
|
|
|
"content", tweet.Text(),
|
2014-06-13 23:35:16 +00:00
|
|
|
importer.AttrURL, url,
|
2014-06-14 21:37:41 +00:00
|
|
|
}
|
|
|
|
if lat, long, ok := tweet.LatLong(); ok {
|
|
|
|
attrs = append(attrs,
|
|
|
|
"latitude", fmt.Sprint(lat),
|
|
|
|
"longitude", fmt.Sprint(long),
|
|
|
|
)
|
|
|
|
}
|
2014-06-16 22:09:22 +00:00
|
|
|
if viaAPI {
|
|
|
|
attrs = append(attrs, attrImportMethod, "api")
|
|
|
|
} else {
|
|
|
|
attrs = append(attrs, attrImportMethod, "zip")
|
|
|
|
}
|
2014-06-17 01:32:05 +00:00
|
|
|
|
|
|
|
for i, m := range tweet.Media() {
|
|
|
|
filename := m.BaseFilename()
|
|
|
|
if tweetNode.Attr("camliPath:"+filename) != "" && (i > 0 || tweetNode.Attr("camliContentImage") != "") {
|
|
|
|
// Don't re-import media we've already fetched.
|
|
|
|
continue
|
|
|
|
}
|
2014-06-19 23:23:10 +00:00
|
|
|
tried, gotMedia := 0, false
|
|
|
|
for _, mediaURL := range m.URLs() {
|
|
|
|
tried++
|
|
|
|
res, err := r.HTTPClient().Get(mediaURL)
|
|
|
|
if err != nil {
|
2014-07-01 09:46:45 +00:00
|
|
|
return false, fmt.Errorf("Error fetching %s for tweet %s : %v", mediaURL, url, err)
|
2014-06-19 23:23:10 +00:00
|
|
|
}
|
|
|
|
if res.StatusCode == http.StatusNotFound {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if res.StatusCode != 200 {
|
|
|
|
return false, fmt.Errorf("HTTP status %s fetching %s for tweet %s", res.StatusCode, mediaURL, url)
|
|
|
|
}
|
|
|
|
if !viaAPI {
|
|
|
|
log.Printf("For zip tweet %s, reading %v", url, mediaURL)
|
|
|
|
}
|
|
|
|
fileRef, err := schema.WriteFileFromReader(r.Host.Target(), filename, res.Body)
|
|
|
|
res.Body.Close()
|
|
|
|
if err != nil {
|
|
|
|
return false, fmt.Errorf("Error fetching media %s for tweet %s: %v", mediaURL, url, err)
|
|
|
|
}
|
|
|
|
attrs = append(attrs, "camliPath:"+filename, fileRef.String())
|
|
|
|
if i == 0 {
|
|
|
|
attrs = append(attrs, "camliContentImage", fileRef.String())
|
|
|
|
}
|
|
|
|
log.Printf("Slurped %s as %s for tweet %s (%v)", mediaURL, fileRef.String(), url, tweetNode.PermanodeRef())
|
|
|
|
gotMedia = true
|
|
|
|
break
|
2014-06-17 01:32:05 +00:00
|
|
|
}
|
2014-06-19 23:23:10 +00:00
|
|
|
if !gotMedia && tried > 0 {
|
|
|
|
return false, fmt.Errorf("All media URLs 404s for tweet %s", url)
|
2014-06-17 01:32:05 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-06-14 21:37:41 +00:00
|
|
|
changes, err := tweetNode.SetAttrs2(attrs...)
|
2014-06-13 23:35:16 +00:00
|
|
|
if err == nil && changes {
|
|
|
|
log.Printf("Imported tweet %s", url)
|
|
|
|
}
|
|
|
|
return !changes, err
|
2014-03-15 01:14:18 +00:00
|
|
|
}
|
|
|
|
|
2014-04-18 22:21:59 +00:00
|
|
|
func (r *run) getTopLevelNode(path string, title string) (*importer.Object, error) {
|
|
|
|
tweets, err := r.RootNode().ChildPathObject(path)
|
2014-03-15 01:14:18 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2014-04-18 22:21:59 +00:00
|
|
|
if err := tweets.SetAttr("title", title); err != nil {
|
2014-03-15 01:14:18 +00:00
|
|
|
return nil, err
|
|
|
|
}
|
2014-04-18 22:21:59 +00:00
|
|
|
return tweets, nil
|
|
|
|
}
|
2014-03-15 01:14:18 +00:00
|
|
|
|
2014-04-18 22:21:59 +00:00
|
|
|
// TODO(mpl): move to an api.go when we it gets bigger.
|
|
|
|
|
|
|
|
// userInfo is the subset of Twitter's verify_credentials response we use.
type userInfo struct {
	ID         string `json:"id_str"`
	ScreenName string `json:"screen_name"`
	Name       string `json:"name,omitempty"`
}
|
|
|
|
|
2014-05-14 21:34:43 +00:00
|
|
|
func getUserInfo(ctx oauthContext) (userInfo, error) {
|
2014-04-18 22:21:59 +00:00
|
|
|
var ui userInfo
|
2014-05-14 21:34:43 +00:00
|
|
|
if err := ctx.doAPI(&ui, userInfoAPIPath); err != nil {
|
2014-04-18 22:21:59 +00:00
|
|
|
return ui, err
|
2014-03-15 01:14:18 +00:00
|
|
|
}
|
2014-04-18 22:21:59 +00:00
|
|
|
if ui.ID == "" {
|
|
|
|
return ui, fmt.Errorf("No userid returned")
|
2014-03-15 01:14:18 +00:00
|
|
|
}
|
2014-04-18 22:21:59 +00:00
|
|
|
return ui, nil
|
2014-03-15 01:14:18 +00:00
|
|
|
}
|
|
|
|
|
2014-04-25 17:47:35 +00:00
|
|
|
func newOauthClient(ctx *importer.SetupContext) (*oauth.Client, error) {
|
2014-04-18 22:21:59 +00:00
|
|
|
clientId, secret, err := ctx.Credentials()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
2014-03-15 01:14:18 +00:00
|
|
|
}
|
2014-04-25 17:47:35 +00:00
|
|
|
return &oauth.Client{
|
|
|
|
TemporaryCredentialRequestURI: temporaryCredentialRequestURL,
|
|
|
|
ResourceOwnerAuthorizationURI: resourceOwnerAuthorizationURL,
|
|
|
|
TokenRequestURI: tokenRequestURL,
|
|
|
|
Credentials: oauth.Credentials{
|
|
|
|
Token: clientId,
|
|
|
|
Secret: secret,
|
|
|
|
},
|
2014-04-18 22:21:59 +00:00
|
|
|
}, nil
|
2014-03-15 01:14:18 +00:00
|
|
|
}
|
|
|
|
|
2014-04-18 22:21:59 +00:00
|
|
|
// ServeSetup starts the OAuth 1.0a dance: it obtains temporary
// credentials, stashes them on the account permanode (for ServeCallback to
// verify), and redirects the user to Twitter's authorization page.
func (im *imp) ServeSetup(w http.ResponseWriter, r *http.Request, ctx *importer.SetupContext) error {
	oauthClient, err := newOauthClient(ctx)
	if err != nil {
		err = fmt.Errorf("error getting OAuth client: %v", err)
		httputil.ServeError(w, r, err)
		return err
	}
	tempCred, err := oauthClient.RequestTemporaryCredentials(ctx.HTTPClient(), ctx.CallbackURL(), nil)
	if err != nil {
		err = fmt.Errorf("Error getting temp cred: %v", err)
		httputil.ServeError(w, r, err)
		return err
	}
	// Persist the temp credentials so the callback handler can match them
	// against what Twitter sends back.
	if err := ctx.AccountNode.SetAttrs(
		acctAttrTempToken, tempCred.Token,
		acctAttrTempSecret, tempCred.Secret,
	); err != nil {
		err = fmt.Errorf("Error saving temp creds: %v", err)
		httputil.ServeError(w, r, err)
		return err
	}

	authURL := oauthClient.AuthorizationURL(tempCred, nil)
	http.Redirect(w, r, authURL, 302)
	return nil
}
|
|
|
|
|
2014-04-18 22:21:59 +00:00
|
|
|
func (im *imp) ServeCallback(w http.ResponseWriter, r *http.Request, ctx *importer.SetupContext) {
|
2014-04-25 17:47:35 +00:00
|
|
|
tempToken := ctx.AccountNode.Attr(acctAttrTempToken)
|
|
|
|
tempSecret := ctx.AccountNode.Attr(acctAttrTempSecret)
|
|
|
|
if tempToken == "" || tempSecret == "" {
|
|
|
|
log.Printf("twitter: no temp creds in callback")
|
|
|
|
httputil.BadRequestError(w, "no temp creds in callback")
|
2014-04-23 15:54:35 +00:00
|
|
|
return
|
|
|
|
}
|
2014-04-25 17:47:35 +00:00
|
|
|
if tempToken != r.FormValue("oauth_token") {
|
|
|
|
log.Printf("unexpected oauth_token: got %v, want %v", r.FormValue("oauth_token"), tempToken)
|
2014-04-18 22:21:59 +00:00
|
|
|
httputil.BadRequestError(w, "unexpected oauth_token")
|
2014-03-15 01:14:18 +00:00
|
|
|
return
|
|
|
|
}
|
2014-04-25 17:47:35 +00:00
|
|
|
oauthClient, err := newOauthClient(ctx)
|
|
|
|
if err != nil {
|
|
|
|
err = fmt.Errorf("error getting OAuth client: %v", err)
|
|
|
|
httputil.ServeError(w, r, err)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
tokenCred, vals, err := oauthClient.RequestToken(
|
|
|
|
ctx.Context.HTTPClient(),
|
|
|
|
&oauth.Credentials{
|
|
|
|
Token: tempToken,
|
|
|
|
Secret: tempSecret,
|
|
|
|
},
|
|
|
|
r.FormValue("oauth_verifier"),
|
|
|
|
)
|
2014-03-15 01:14:18 +00:00
|
|
|
if err != nil {
|
2014-04-18 22:21:59 +00:00
|
|
|
httputil.ServeError(w, r, fmt.Errorf("Error getting request token: %v ", err))
|
2014-03-15 01:14:18 +00:00
|
|
|
return
|
|
|
|
}
|
|
|
|
userid := vals.Get("user_id")
|
|
|
|
if userid == "" {
|
2014-04-18 22:21:59 +00:00
|
|
|
httputil.ServeError(w, r, fmt.Errorf("Couldn't get user id: %v", err))
|
2014-03-15 01:14:18 +00:00
|
|
|
return
|
|
|
|
}
|
2014-04-25 17:47:35 +00:00
|
|
|
if err := ctx.AccountNode.SetAttrs(
|
|
|
|
acctAttrAccessToken, tokenCred.Token,
|
|
|
|
acctAttrAccessTokenSecret, tokenCred.Secret,
|
|
|
|
); err != nil {
|
|
|
|
httputil.ServeError(w, r, fmt.Errorf("Error setting token attributes: %v", err))
|
|
|
|
return
|
|
|
|
}
|
2014-03-15 01:14:18 +00:00
|
|
|
|
2014-05-14 21:34:43 +00:00
|
|
|
u, err := getUserInfo(oauthContext{ctx.Context, oauthClient, tokenCred})
|
2014-04-18 22:21:59 +00:00
|
|
|
if err != nil {
|
|
|
|
httputil.ServeError(w, r, fmt.Errorf("Couldn't get user info: %v", err))
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if err := ctx.AccountNode.SetAttrs(
|
2014-06-13 23:35:16 +00:00
|
|
|
importer.AcctAttrUserID, u.ID,
|
|
|
|
importer.AcctAttrName, u.Name,
|
|
|
|
importer.AcctAttrUserName, u.ScreenName,
|
2014-04-18 22:21:59 +00:00
|
|
|
); err != nil {
|
|
|
|
httputil.ServeError(w, r, fmt.Errorf("Error setting attribute: %v", err))
|
|
|
|
return
|
|
|
|
}
|
|
|
|
http.Redirect(w, r, ctx.AccountURL(), http.StatusFound)
|
2014-03-15 01:14:18 +00:00
|
|
|
}
|
2014-05-14 21:34:43 +00:00
|
|
|
|
|
|
|
// oauthContext is used as a value type, wrapping a context and oauth information.
//
// TODO: move this up to pkg/importer?
type oauthContext struct {
	*context.Context
	client *oauth.Client      // application (consumer) credentials + endpoints
	creds  *oauth.Credentials // per-user access token/secret
}
|
|
|
|
|
|
|
|
func (ctx oauthContext) doAPI(result interface{}, apiPath string, keyval ...string) error {
|
|
|
|
if len(keyval)%2 == 1 {
|
|
|
|
panic("Incorrect number of keyval arguments. must be even.")
|
|
|
|
}
|
|
|
|
form := url.Values{}
|
|
|
|
for i := 0; i < len(keyval); i += 2 {
|
|
|
|
if keyval[i+1] != "" {
|
|
|
|
form.Set(keyval[i], keyval[i+1])
|
|
|
|
}
|
|
|
|
}
|
|
|
|
fullURL := apiURL + apiPath
|
|
|
|
res, err := ctx.doGet(fullURL, form)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
err = httputil.DecodeJSON(res, result)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("could not parse response for %s: %v", fullURL, err)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ctx oauthContext) doGet(url string, form url.Values) (*http.Response, error) {
|
|
|
|
if ctx.creds == nil {
|
|
|
|
return nil, errors.New("No OAuth credentials. Not logged in?")
|
|
|
|
}
|
|
|
|
if ctx.client == nil {
|
|
|
|
return nil, errors.New("No OAuth client.")
|
|
|
|
}
|
|
|
|
res, err := ctx.client.Get(ctx.HTTPClient(), ctx.creds, url, form)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("Error fetching %s: %v", url, err)
|
|
|
|
}
|
|
|
|
if res.StatusCode != http.StatusOK {
|
|
|
|
return nil, fmt.Errorf("Get request on %s failed with: %s", url, res.Status)
|
|
|
|
}
|
|
|
|
return res, nil
|
|
|
|
}
|
2014-06-14 21:37:41 +00:00
|
|
|
|
2014-06-17 01:32:05 +00:00
|
|
|
// tweetItem abstracts over the two tweet JSON shapes (REST API vs archive
// zip) so importTweet can handle both uniformly.
type tweetItem interface {
	ID() string
	LatLong() (lat, long float64, ok bool)
	CreatedAt() string // raw timestamp string; format differs between API and zip
	Text() string
	Media() []tweetMedia
}

// tweetMedia is one media attachment of a tweet.
type tweetMedia interface {
	URLs() []string // use first non-404 one
	BaseFilename() string
}
|
|
|
|
|
|
|
|
// apiTweetItem is a tweet as returned by the REST API.
type apiTweetItem struct {
	Id           string   `json:"id_str"`
	TextStr      string   `json:"text"`
	CreatedAtStr string   `json:"created_at"`
	Entities     entities `json:"entities"`

	// One or both might be present:
	Geo         *geo    `json:"geo"`         // lat, long
	Coordinates *coords `json:"coordinates"` // geojson: long, lat
}

// zipTweetItem is like apiTweetItem, but twitter is annoying and the schema for the JSON inside zip files is slightly different.
type zipTweetItem struct {
	Id           string `json:"id_str"`
	TextStr      string `json:"text"`
	CreatedAtStr string `json:"created_at"`

	// One or both might be present:
	Geo         *geo        `json:"geo"`         // lat, long
	Coordinates *coords     `json:"coordinates"` // geojson: long, lat
	Entities    zipEntities `json:"entities"`
}
|
|
|
|
|
|
|
|
// ID returns the tweet's string ID, panicking if it is empty (which would
// indicate a JSON decoding problem upstream).
func (t *apiTweetItem) ID() string {
	if t.Id == "" {
		panic("empty id")
	}
	return t.Id
}

// ID returns the tweet's string ID, panicking if it is empty.
func (t *zipTweetItem) ID() string {
	if t.Id == "" {
		panic("empty id")
	}
	return t.Id
}
|
|
|
|
|
|
|
|
// CreatedAt and Text expose the raw decoded JSON fields for both formats.
func (t *apiTweetItem) CreatedAt() string { return t.CreatedAtStr }
func (t *zipTweetItem) CreatedAt() string { return t.CreatedAtStr }

func (t *apiTweetItem) Text() string { return t.TextStr }
func (t *zipTweetItem) Text() string { return t.TextStr }
|
|
|
|
|
|
|
|
// LatLong reports the tweet's location, preferring the "geo" field over
// the GeoJSON "coordinates" field; see latLong.
func (t *apiTweetItem) LatLong() (lat, long float64, ok bool) {
	return latLong(t.Geo, t.Coordinates)
}

// LatLong reports the tweet's location; see latLong.
func (t *zipTweetItem) LatLong() (lat, long float64, ok bool) {
	return latLong(t.Geo, t.Coordinates)
}
|
|
|
|
|
|
|
|
func latLong(g *geo, c *coords) (lat, long float64, ok bool) {
|
|
|
|
if g != nil && len(g.Coordinates) == 2 {
|
|
|
|
co := g.Coordinates
|
|
|
|
if co[0] != 0 && co[1] != 0 {
|
|
|
|
return co[0], co[1], true
|
2014-06-14 21:37:41 +00:00
|
|
|
}
|
|
|
|
}
|
2014-06-17 01:32:05 +00:00
|
|
|
if c != nil && len(c.Coordinates) == 2 {
|
2014-07-11 14:53:29 +00:00
|
|
|
co := c.Coordinates
|
2014-06-17 01:32:05 +00:00
|
|
|
if co[0] != 0 && co[1] != 0 {
|
|
|
|
return co[1], co[0], true
|
2014-06-14 21:37:41 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2014-06-19 23:23:10 +00:00
|
|
|
func (t *zipTweetItem) Media() (ret []tweetMedia) {
|
|
|
|
for _, m := range t.Entities.Media {
|
|
|
|
ret = append(ret, m)
|
|
|
|
}
|
|
|
|
ret = append(ret, getImagesFromURLs(t.Entities.URLs)...)
|
|
|
|
return
|
2014-06-17 01:32:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (t *apiTweetItem) Media() (ret []tweetMedia) {
|
|
|
|
for _, m := range t.Entities.Media {
|
|
|
|
ret = append(ret, m)
|
|
|
|
}
|
2014-06-19 23:23:10 +00:00
|
|
|
ret = append(ret, getImagesFromURLs(t.Entities.URLs)...)
|
2014-06-17 01:32:05 +00:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2014-06-14 21:37:41 +00:00
|
|
|
// geo is Twitter's legacy "geo" location object. Its coordinates are
// ordered latitude, longitude (the reverse of coords).
type geo struct {
	Coordinates []float64 `json:"coordinates"` // lat,long
}
|
|
|
|
|
|
|
|
// coords is Twitter's GeoJSON-style "coordinates" location object. Its
// coordinates are ordered longitude, latitude (the reverse of geo).
type coords struct {
	Coordinates []float64 `json:"coordinates"` // long,lat
}
|
2014-06-17 01:32:05 +00:00
|
|
|
|
|
|
|
// entities is the "entities" object of a tweet from the REST API:
// attached media plus URLs extracted from the tweet text.
// See also zipEntities, the zip-export variant.
type entities struct {
	Media []*media     `json:"media"`
	URLs  []*urlEntity `json:"urls"`
}
|
|
|
|
|
2014-06-19 23:23:10 +00:00
|
|
|
// zipEntities is like entities, but for the slightly different JSON
// schema found in Twitter's zip archive exports (note the different
// media element type).
type zipEntities struct {
	Media []*zipMedia  `json:"media"`
	URLs  []*urlEntity `json:"urls"`
}
|
|
|
|
|
|
|
|
// urlEntity is one URL extracted from a tweet's text.
// e.g. {
//   "indices" : [ 105, 125 ],
//   "url" : "http:\/\/t.co\/gbGO8Qep",
//   "expanded_url" : "http:\/\/twitpic.com\/6mdqac",
//   "display_url" : "twitpic.com\/6mdqac"
// }
type urlEntity struct {
	URL         string `json:"url"`          // the t.co shortened URL
	ExpandedURL string `json:"expanded_url"` // the full destination URL
	DisplayURL  string `json:"display_url"`  // scheme-less form shown in the tweet
}
|
|
|
|
|
|
|
|
// Regexps recognizing links to known image-hosting sites in a tweet's
// URL entities. The single capture group is the host's short image ID.
var (
	twitpicRx = regexp.MustCompile(`\btwitpic\.com/(\w\w\w+)`)
	imgurRx   = regexp.MustCompile(`\bimgur\.com/(\w\w\w+)`)
)
|
|
|
|
|
|
|
|
func getImagesFromURLs(urls []*urlEntity) (ret []tweetMedia) {
|
|
|
|
// TODO: extract these regexps from tweet text too. Happens in
|
|
|
|
// a few cases I've seen in my history.
|
|
|
|
for _, u := range urls {
|
|
|
|
if strings.HasPrefix(u.DisplayURL, "twitpic.com") {
|
|
|
|
ret = append(ret, twitpicImage(strings.TrimPrefix(u.DisplayURL, "twitpic.com/")))
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if m := imgurRx.FindStringSubmatch(u.DisplayURL); m != nil {
|
|
|
|
ret = append(ret, imgurImage(m[1]))
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// The Media entity from the Rest API. See also: zipMedia.
type media struct {
	Id            string               `json:"id_str"`
	IdNum         int64                `json:"id"`
	MediaURL      string               `json:"media_url"`       // plain-http image URL
	MediaURLHTTPS string               `json:"media_url_https"` // https image URL, preferred when set
	Sizes         map[string]mediaSize `json:"sizes"`           // size name -> dimensions
	Type_         string               `json:"type"`
}
|
|
|
|
|
2014-06-19 23:23:10 +00:00
|
|
|
// The Media entity from the zip file JSON. Similar but different to
// media. Thanks, Twitter.
type zipMedia struct {
	Id            string `json:"id_str"`
	IdNum         int64  `json:"id"`
	MediaURL      string `json:"media_url"`       // plain-http image URL
	MediaURLHTTPS string `json:"media_url_https"` // https image URL, preferred when set
	Sizes         []mediaSize `json:"sizes"` // without a key! useless.
}
|
2014-06-17 01:32:05 +00:00
|
|
|
|
2014-06-19 23:23:10 +00:00
|
|
|
func (m *media) URLs() []string {
|
2014-06-17 01:32:05 +00:00
|
|
|
u := m.baseURL()
|
|
|
|
if u == "" {
|
2014-06-19 23:23:10 +00:00
|
|
|
return nil
|
2014-06-17 01:32:05 +00:00
|
|
|
}
|
2014-06-19 23:23:10 +00:00
|
|
|
return []string{u + m.largestMediaSuffix(), u}
|
2014-06-17 01:32:05 +00:00
|
|
|
}
|
|
|
|
|
2014-06-19 23:23:10 +00:00
|
|
|
func (m *zipMedia) URLs() []string {
|
|
|
|
// We don't get any suffix names, so just try some common
|
|
|
|
// ones. The first non-404 will be used:
|
|
|
|
u := m.baseURL()
|
|
|
|
if u == "" {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return []string{
|
|
|
|
u + ":large",
|
|
|
|
u,
|
|
|
|
}
|
|
|
|
}
|
2014-06-17 01:32:05 +00:00
|
|
|
|
|
|
|
func (m *media) baseURL() string {
|
|
|
|
if v := m.MediaURLHTTPS; v != "" {
|
|
|
|
return v
|
|
|
|
}
|
|
|
|
return m.MediaURL
|
|
|
|
}
|
|
|
|
|
2014-06-19 23:23:10 +00:00
|
|
|
func (m *zipMedia) baseURL() string {
|
|
|
|
if v := m.MediaURLHTTPS; v != "" {
|
|
|
|
return v
|
|
|
|
}
|
|
|
|
return m.MediaURL
|
|
|
|
}
|
|
|
|
|
2014-06-17 01:32:05 +00:00
|
|
|
func (m *media) BaseFilename() string {
|
|
|
|
return path.Base(m.baseURL())
|
|
|
|
}
|
|
|
|
|
2014-06-19 23:23:10 +00:00
|
|
|
func (m *zipMedia) BaseFilename() string {
|
|
|
|
return path.Base(m.baseURL())
|
|
|
|
}
|
|
|
|
|
2014-06-17 01:32:05 +00:00
|
|
|
func (m *media) largestMediaSuffix() string {
|
|
|
|
bestPixels := 0
|
|
|
|
bestSuffix := ""
|
|
|
|
for k, sz := range m.Sizes {
|
|
|
|
if px := sz.W * sz.H; px > bestPixels {
|
|
|
|
bestPixels = px
|
|
|
|
bestSuffix = ":" + k
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return bestSuffix
|
|
|
|
}
|
|
|
|
|
|
|
|
// mediaSize describes one available rendition of a media item.
type mediaSize struct {
	W      int    `json:"w"` // width in pixels
	H      int    `json:"h"` // height in pixels
	Resize string `json:"resize"`
}
|
|
|
|
|
|
|
|
// twitpicImage is an image hosted on twitpic, identified by its short
// ID from the URL.
type twitpicImage string

// BaseFilename returns the twitpic ID, used as the stored file's name.
func (im twitpicImage) BaseFilename() string {
	return string(im)
}

// URLs returns the single URL serving the large rendition of the image.
func (im twitpicImage) URLs() []string {
	u := "https://twitpic.com/show/large/" + string(im)
	return []string{u}
}
|
|
|
|
|
|
|
|
// imgurImage is an image hosted on imgur, identified by its short ID
// from the URL.
type imgurImage string

// BaseFilename returns the imgur ID, used as the stored file's name.
func (im imgurImage) BaseFilename() string {
	return string(im)
}

// URLs returns the single URL serving the image.
// Imgur ignores the suffix if it's .gif, .png, or .jpg. So just pick .gif.
// The actual content will be returned.
func (im imgurImage) URLs() []string {
	u := "https://i.imgur.com/" + string(im) + ".gif"
	return []string{u}
}
|