Merge "importer/twitter: add likes import"

This commit is contained in:
Mathieu Lonjaret 2018-08-21 23:53:37 +00:00 committed by Gerrit Code Review
commit 9a49f0959d
4 changed files with 88 additions and 11 deletions

View File

@ -75,6 +75,9 @@ escaping. For example:
rest of whole expression (regardless of its position within the expression),
logical 'or's around it are forbidden.
**<a name="like"></a>is:like**
: object is a liked tweet
**<a name="image"></a>is:image**
: object is an image

View File

@ -56,6 +56,7 @@ const (
tokenRequestURL = "https://api.twitter.com/oauth/access_token"
userInfoAPIPath = "account/verify_credentials.json"
userTimeLineAPIPath = "statuses/user_timeline.json"
userLikesAPIPath = "favorites/list.json"
// runCompleteVersion is a cache-busting version number of the
// importer code. It should be incremented whenever the
@ -75,6 +76,13 @@ const (
// ... and re-do an import.
acctAttrTweetZip = "twitterArchiveZipFileRef"
// acctAttrImportLikes specifies an optional attribute for the account permanode.
// If set to true, likes are imported via the twitter API.
// You can enable importing likes like this:
// $ pk-put attr <acct-permanode> twitterImportLikes true
// ... and re-do an import.
acctAttrImportLikes = "twitterImportLikes"
// acctAttrZipDoneVersion is updated at the end of a successful zip import and
// is used to determine whether the zip file needs to be re-imported in a future run.
acctAttrZipDoneVersion = "twitterZipDoneVersion" // == "<fileref>:<runCompleteVersion>"
@ -84,6 +92,12 @@ const (
tweetRequestLimit = 200 // max number of tweets we can get in a user_timeline request
tweetsAtOnce = 20 // how many tweets to import at once
// A tweet is stored as a permanode with the "twitter.com:tweet" camliNodeType value.
nodeTypeTweet = "twitter.com:tweet"
// A like is stored as a permanode with the "twitter.com:like" camliNodeType value.
nodeTypeLike = "twitter.com:like"
)
var oAuthURIs = importer.OAuthURIs{
@ -169,6 +183,10 @@ return the zip-fileref), and signal the twitter importer that you have it, with<
"pk-put attr &lt;acct-permanode&gt; twitterArchiveZipFileRef &lt;zip-fileref&gt;".<br>
Then you can start running the importer.
</p>
<p>
If you want to import likes as well, please run <br>
"pk-put attr &lt;acct-permanode&gt; twitterImportLikes true" to enable it.
</p>
`, base, base+"/callback")
}
@ -225,7 +243,18 @@ func (im *imp) Run(ctx *importer.RunContext) error {
skipAPITweets, _ := strconv.ParseBool(os.Getenv("CAMLI_TWITTER_SKIP_API_IMPORT"))
if !skipAPITweets {
if err := r.importTweets(userID); err != nil {
if err := r.importTweets(userID, userTimeLineAPIPath); err != nil {
return err
}
}
acctNode, err = ctx.Host.ObjectFromRef(acctNode.PermanodeRef())
if err != nil {
return fmt.Errorf("error reloading account node: %v", err)
}
importLikes, err := strconv.ParseBool(acctNode.Attr(acctAttrImportLikes))
if err == nil && importLikes {
if err := r.importTweets(userID, userLikesAPIPath); err != nil {
return err
}
}
@ -339,11 +368,22 @@ func (r *run) doAPI(result interface{}, apiPath string, keyval ...string) error
r.accessCreds}.PopulateJSONFromURL(result, apiURL+apiPath, keyval...)
}
func (r *run) importTweets(userID string) error {
// importTweets imports the tweets related to userID, through apiPath.
// If apiPath is userTimeLineAPIPath, the tweets and retweets posted by userID are imported.
// If apiPath is userLikesAPIPath, the tweets liked by userID are imported.
func (r *run) importTweets(userID string, apiPath string) error {
maxId := ""
continueRequests := true
tweetsNode, err := r.getTopLevelNode("tweets")
var tweetsNode *importer.Object
var err error
var importType string
if apiPath == userLikesAPIPath {
importType = "likes"
} else {
importType = "tweets"
}
tweetsNode, err = r.getTopLevelNode(importType)
if err != nil {
return err
}
@ -368,11 +408,11 @@ func (r *run) importTweets(userID string) error {
var resp []*apiTweetItem
var err error
if maxId == "" {
log.Printf("twitter: fetching tweets for userid %s", userID)
err = r.doAPI(&resp, userTimeLineAPIPath, attrs...)
log.Printf("twitter: fetching %s for userid %s", importType, userID)
err = r.doAPI(&resp, apiPath, attrs...)
} else {
log.Printf("twitter: fetching tweets for userid %s with max ID %s", userID, maxId)
err = r.doAPI(&resp, userTimeLineAPIPath,
// Arguments must follow the verb order: import type, then user ID, then max ID.
log.Printf("twitter: fetching %s for userid %s with max ID %s", importType, userID, maxId)
err = r.doAPI(&resp, apiPath,
append(attrs, "max_id", maxId)...)
}
if err != nil {
@ -416,14 +456,14 @@ func (r *run) importTweets(userID string) error {
return err
}
numTweets += newThisBatch
log.Printf("twitter: imported %d tweets this batch; %d total.", newThisBatch, numTweets)
log.Printf("twitter: imported %d %s this batch; %d total.", newThisBatch, importType, numTweets)
if r.incremental && allDups {
log.Printf("twitter: incremental import found end batch")
break
}
continueRequests = newThisBatch > 0
}
log.Printf("twitter: successfully did full run of importing %d tweets", numTweets)
log.Printf("twitter: successfully did full run of importing %d %s", numTweets, importType)
return nil
}
@ -533,9 +573,14 @@ func (r *run) importTweet(parent *importer.Object, tweet tweetItem, viaAPI bool)
r.AccountNode().Attr(importer.AcctAttrUserName),
id)
nodeType := nodeTypeTweet
if tweet.Liked() {
nodeType = nodeTypeLike
}
attrs := []string{
"twitterId", id,
nodeattr.Type, "twitter.com:tweet",
nodeattr.Type, nodeType,
nodeattr.StartDate, schema.RFC3339FromTime(createdTime),
nodeattr.Content, tweet.Text(),
nodeattr.URL, url,
@ -617,6 +662,8 @@ func (r *run) getTopLevelNode(path string) (*importer.Object, error) {
switch path {
case "tweets":
title = fmt.Sprintf("%s's Tweets", acctNode.Attr(importer.AcctAttrUserName))
case "likes":
title = fmt.Sprintf("%s's Likes", acctNode.Attr(importer.AcctAttrUserName))
}
return obj, obj.SetAttr(nodeattr.Title, title)
}
@ -732,6 +779,7 @@ type tweetItem interface {
CreatedAt() string
Text() string
Media() []tweetMedia
Liked() bool
}
type tweetMedia interface {
@ -744,6 +792,7 @@ type apiTweetItem struct {
TextStr string `json:"text"`
CreatedAtStr string `json:"created_at"`
Entities entities `json:"entities"`
Favorited bool `json:"favorited"`
// One or both might be present:
Geo *geo `json:"geo"` // lat, long
@ -822,6 +871,9 @@ func (t *apiTweetItem) Media() (ret []tweetMedia) {
return
}
// Liked reports whether the item was flagged as favorited, i.e. the value
// decoded from the "favorited" field of the API response.
func (t *apiTweetItem) Liked() bool {
	favorited := t.Favorited
	return favorited
}
// Liked always returns false for a zipTweetItem.
func (t *zipTweetItem) Liked() bool {
	const liked = false
	return liked
}
// geo holds the coordinate pair decoded from the "coordinates" JSON key.
type geo struct {
// Coordinates is ordered latitude first, then longitude.
Coordinates []float64 `json:"coordinates"` // lat,long
}

View File

@ -126,6 +126,7 @@ func init() {
// Custom predicates
registerKeyword(newIsPost())
registerKeyword(newIsLike())
registerKeyword(newIsCheckin())
// Location predicates
@ -839,6 +840,27 @@ func (k isPost) Predicate(ctx context.Context, args []string) (*Constraint, erro
}, nil
}
// isLike is the keyword type behind the "is:like" search predicate
// (constructed by newIsLike). It embeds matchEqual.
// NOTE(review): matchEqual is defined elsewhere; presumably it supplies the
// keyword-name matching for this type — confirm.
type isLike struct {
matchEqual
}
// newIsLike returns the keyword value for the "is:like" predicate, with its
// embedded matchEqual set to the literal "is:like".
func newIsLike() keyword {
	like := isLike{matchEqual: "is:like"}
	return like
}
// Description returns the human-readable summary of what the is:like
// keyword matches.
func (k isLike) Description() string {
	const summary = "matches liked tweets"
	return summary
}
// Predicate builds the constraint for the is:like keyword: a permanode whose
// nodeattr.Type attribute equals "twitter.com:like". Neither ctx nor args is
// consulted, and the returned error is always nil.
func (k isLike) Predicate(ctx context.Context, args []string) (*Constraint, error) {
	likedNode := &PermanodeConstraint{
		Attr:  nodeattr.Type,
		Value: "twitter.com:like",
	}
	constraint := &Constraint{Permanode: likedNode}
	return constraint, nil
}
// isCheckin is a search keyword type that embeds matchEqual.
// NOTE(review): its constructor and methods are not visible here; presumably
// it backs the predicate registered through newIsCheckin — confirm.
type isCheckin struct {
matchEqual
}

View File

@ -85,7 +85,7 @@ cam.BlobItemTwitterContent.getHandler = function(blobref, searchSession, href) {
return null;
}
if (cam.permanodeUtils.getCamliNodeType(m.permanode) != 'twitter.com:tweet') {
if (cam.permanodeUtils.getCamliNodeType(m.permanode) != 'twitter.com:tweet' && cam.permanodeUtils.getCamliNodeType(m.permanode) != 'twitter.com:like') {
return null;
}