perkeep/pkg/importer/feed/parse.go

458 lines
11 KiB
Go

/*
Copyright 2014 The Perkeep Authors
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package feed
import (
"bytes"
"encoding/xml"
"fmt"
"html"
"log"
"net/url"
"strings"
"time"
"golang.org/x/net/html/charset"
"perkeep.org/pkg/importer/feed/atom"
"perkeep.org/pkg/importer/feed/rdf"
"perkeep.org/pkg/importer/feed/rss"
)
// feed is the format-independent representation produced by the Atom, RSS
// and RDF parsers in this file.
type feed struct {
	// Title is the feed-level title copied from the parsed document.
	Title string
	// Updated is the feed-level timestamp; it stays the zero time when no
	// date in the document could be parsed.
	Updated time.Time
	// Link is the feed-level link selected by the parser.
	Link string
	// Items holds one entry per item/entry element, in the order the
	// parser encountered them.
	Items []*item
}
// item is a single feed entry, normalized across Atom, RSS and RDF.
type item struct {
	// ID is the entry identifier: the Atom entry ID, the RSS guid value, or
	// the RDF item's About value.
	ID string
	// Title is the entry title with HTML entities unescaped.
	Title string
	// Link is the entry link; Atom links are resolved against the entry's
	// XML base when that succeeds.
	Link string
	// Created is not populated by any of the parsers visible in this part of
	// the file. NOTE(review): confirm whether other code sets it.
	Created time.Time
	// Published is the publication time. The RSS and RDF parsers set it to
	// the same value as Updated.
	Published time.Time
	// Updated is the last-modification time; zero when no date parsed.
	Updated time.Time
	// Author is the author name (Atom), author (RSS) or creator (RDF).
	Author string
	// Content is the entry body.
	Content string
	// MediaContent is the URL of an audio enclosure or media element; only
	// the RSS parser sets it.
	MediaContent string
}
// parseFeed converts raw feed bytes into a feed by trying the Atom, RSS and
// RDF parsers in that order and returning the first non-nil result.
//
// When every parser fails, each parser's error is logged and a generic
// error is returned. feedURL is accepted but not consulted by this function.
func parseFeed(body []byte, feedURL string) (*feed, error) {
	parsed, atomErr := parseAtom(body)
	if parsed != nil {
		return parsed, nil
	}

	parsed, rssErr := parseRSS(body)
	if parsed != nil {
		return parsed, nil
	}

	parsed, rdfErr := parseRDF(body)
	if parsed != nil {
		return parsed, nil
	}

	// All three formats were rejected; report why before giving up.
	log.Printf("atom parse error: %s", atomErr.Error())
	log.Printf("xml parse error: %s", rssErr.Error())
	log.Printf("rdf parse error: %s", rdfErr.Error())
	return nil, fmt.Errorf("Could not parse feed data")
}
// parseAtom decodes body as an Atom document and converts it into a feed.
//
// It returns the XML decoding error when body cannot be decoded into an
// atom.Feed. Dates that cannot be parsed are left as the zero time, and
// links that cannot be resolved against their base are kept as written.
func parseAtom(body []byte) (*feed, error) {
	dec := xml.NewDecoder(bytes.NewReader(body))
	dec.CharsetReader = charset.NewReaderLabel

	var doc atom.Feed
	if err := dec.Decode(&doc); err != nil {
		return nil, err
	}

	out := &feed{Title: doc.Title}
	if updated, err := parseDate(string(doc.Updated)); err == nil {
		out.Updated = updated
	}

	// Relative links are resolved against the feed's XMLBase; an
	// unparsable base degrades to the empty URL.
	base, err := url.Parse(doc.XMLBase)
	if err != nil {
		base, _ = url.Parse("")
	}
	if len(doc.Link) > 0 {
		out.Link = findBestAtomLink(doc.Link)
		if abs, err := base.Parse(out.Link); err == nil {
			out.Link = abs.String()
		}
	}

	for _, entry := range doc.Entry {
		// Each entry may carry its own base, itself relative to the feed's.
		entryBase, err := base.Parse(entry.XMLBase)
		if err != nil {
			entryBase = base
		}

		it := &item{
			ID:    entry.ID,
			Title: atomTitle(entry.Title),
		}
		if updated, err := parseDate(string(entry.Updated)); err == nil {
			it.Updated = updated
		}
		if published, err := parseDate(string(entry.Published)); err == nil {
			it.Published = published
		}
		if len(entry.Link) > 0 {
			it.Link = findBestAtomLink(entry.Link)
			if abs, err := entryBase.Parse(it.Link); err == nil {
				it.Link = abs.String()
			}
		}
		if entry.Author != nil {
			it.Author = entry.Author.Name
		}

		// Prefer the content element (text body first, then raw inner XML);
		// the summary is used only when there is no content element at all.
		switch {
		case entry.Content != nil:
			if strings.TrimSpace(entry.Content.Body) != "" {
				it.Content = entry.Content.Body
			} else if entry.Content.InnerXML != "" {
				it.Content = entry.Content.InnerXML
			}
		case entry.Summary != nil:
			it.Content = entry.Summary.Body
		}

		out.Items = append(out.Items, it)
	}
	return out, nil
}
// parseRSS decodes body as an RSS document and converts it into a feed.
//
// It returns the XML decoding error when body cannot be decoded into an
// rss.RSS. Each item's Published and Updated are both set from the first
// parsable value among its PubDate, Date and Published fields.
func parseRSS(body []byte) (*feed, error) {
	dec := xml.NewDecoder(bytes.NewReader(body))
	dec.CharsetReader = charset.NewReaderLabel
	// Tags without an explicit namespace are reported in this namespace.
	// NOTE(review): presumably matched by the struct tags in package rss — confirm.
	dec.DefaultSpace = "DefaultSpace"

	var doc rss.RSS
	if err := dec.Decode(&doc); err != nil {
		return nil, err
	}

	out := &feed{Title: doc.Title}
	if updated, err := parseDate(doc.LastBuildDate, doc.PubDate); err == nil {
		out.Updated = updated
	}
	out.Link = doc.BaseLink()

	for _, entry := range doc.Items {
		// Content falls back to the description when empty.
		content := entry.Content
		if content == "" {
			content = entry.Description
		}

		// The title falls back to the description too, but is dropped when
		// it would merely repeat the content.
		title := entry.Title
		if title == "" {
			title = entry.Description
		}
		if title == content {
			title = ""
		}

		it := &item{
			Title:   textTitle(title),
			Link:    entry.Link,
			Author:  entry.Author,
			Content: content,
		}
		if entry.Guid != nil {
			it.ID = entry.Guid.Guid
		}

		// Only audio attachments are recorded; the enclosure wins over media.
		switch {
		case entry.Enclosure != nil && strings.HasPrefix(entry.Enclosure.Type, "audio/"):
			it.MediaContent = entry.Enclosure.Url
		case entry.Media != nil && strings.HasPrefix(entry.Media.Type, "audio/"):
			it.MediaContent = entry.Media.URL
		}

		if when, err := parseDate(entry.PubDate, entry.Date, entry.Published); err == nil {
			it.Published = when
			it.Updated = when
		}

		out.Items = append(out.Items, it)
	}
	return out, nil
}
// parseRDF decodes body as an RDF document and converts it into a feed.
//
// It returns the XML decoding error when body cannot be decoded into an
// rdf.RDF. Feed-level fields are filled only when the document has a
// channel; each item's Published and Updated are both set from its Date.
func parseRDF(body []byte) (*feed, error) {
	dec := xml.NewDecoder(bytes.NewReader(body))
	dec.CharsetReader = charset.NewReaderLabel

	var doc rdf.RDF
	if err := dec.Decode(&doc); err != nil {
		return nil, err
	}

	out := new(feed)
	if ch := doc.Channel; ch != nil {
		out.Title = ch.Title
		out.Link = ch.Link
		if updated, err := parseDate(ch.Date); err == nil {
			out.Updated = updated
		}
	}

	for _, entry := range doc.Item {
		// The description is preferred; the content is used only when the
		// description is empty. Either way HTML entities are unescaped.
		raw := entry.Description
		if raw == "" {
			raw = entry.Content
		}

		it := &item{
			ID:      entry.About,
			Title:   textTitle(entry.Title),
			Link:    entry.Link,
			Author:  entry.Creator,
			Content: html.UnescapeString(raw),
		}
		if when, err := parseDate(entry.Date); err == nil {
			it.Published = when
			it.Updated = when
		}

		out.Items = append(out.Items, it)
	}
	return out, nil
}
// textTitle returns t with HTML entities (for example "&amp;") replaced by
// the characters they stand for.
func textTitle(t string) string {
	unescaped := html.UnescapeString(t)
	return unescaped
}
// atomTitle returns the body of an Atom text construct with HTML entities
// unescaped, or the empty string when t is nil.
func atomTitle(t *atom.Text) string {
	if t != nil {
		// Titles whose Type is "html" get no special handling: tags are not
		// stripped. goread strips them at this point, see:
		// https://github.com/mjibson/goread/blob/59aec794f3ef87b36c1bac029438c33a6aa6d8d3/utils.go#L533
		//   return html.UnescapeString(sanitizer.StripTags(t.Body))
		return textTitle(t.Body)
	}
	return ""
}
// findBestAtomLink returns the Href of the highest-ranked link in links, or
// the empty string when links is empty. When several links share the top
// rank, the earliest one wins.
//
// Ranking, best first: rel="alternate" with type text/html (5), any other
// text/html link (4), a link with no rel (3), rel="self" (2), anything else
// (1). A rel="hub" link always ranks lowest (0), whatever its type.
func findBestAtomLink(links []atom.Link) string {
	rank := func(l atom.Link) int {
		if l.Rel == "hub" {
			return 0
		}
		isHTML := l.Type == "text/html"
		if isHTML && l.Rel == "alternate" {
			return 5
		}
		if isHTML {
			return 4
		}
		if l.Rel == "self" {
			return 2
		}
		if l.Rel == "" {
			return 3
		}
		return 1
	}

	best, bestRank := "", -1
	for i := range links {
		if r := rank(links[i]); r > bestRank {
			best, bestRank = links[i].Href, r
		}
	}
	return best
}
// dateFormats lists the time layouts that are tried, in order, when parsing
// a date string found in a feed. The first layout that parses wins, so the
// order matters for ambiguous inputs: for example "01-02-2006" (month first)
// precedes "02-01-2006" (day first).
//
// Every entry is a Go reference-time layout (Mon Jan 2 15:04:05 MST 2006).
// Text that is not a layout token, such as "UT", "Updated" or "p.m.", must
// appear literally in the input.
var dateFormats = []string{
	"01-02-2006",
	"01/02/2006",
	"01/02/2006 - 15:04",
	"01/02/2006 15:04:05 MST",
	"01/02/2006 3:04 PM",
	"02-01-2006",
	"02/01/2006",
	"02.01.2006 -0700",
	"02/01/2006 - 15:04",
	"02.01.2006 15:04",
	"02/01/2006 15:04:05",
	"02.01.2006 15:04:05",
	"02-01-2006 15:04:05 MST",
	"02/01/2006 15:04 MST",
	"02 Jan 2006",
	"02 Jan 2006 15:04:05",
	"02 Jan 2006 15:04:05 -0700",
	"02 Jan 2006 15:04:05 MST",
	"02 Jan 2006 15:04:05 UT",
	"02 Jan 2006 15:04 MST",
	"02 Monday, Jan 2006 15:04",
	"06-1-2 15:04",
	"06/1/2 15:04",
	"1/2/2006",
	"1/2/2006 15:04:05 MST",
	"1/2/2006 3:04:05 PM",
	"1/2/2006 3:04:05 PM MST",
	"15:04 02.01.2006 -0700",
	"2006-01-02",
	"2006/01/02",
	"2006-01-02 00:00:00.0 15:04:05.0 -0700",
	"2006-01-02 15:04",
	"2006-01-02 15:04:05 -0700",
	"2006-01-02 15:04:05-07:00",
	"2006-01-02 15:04:05-0700",
	"2006-01-02 15:04:05 MST",
	"2006-01-02 15:04:05Z",
	"2006-01-02 at 15:04:05",
	"2006-01-02T15:04:05",
	"2006-01-02T15:04:05:00",
	"2006-01-02T15:04:05 -0700",
	"2006-01-02T15:04:05-07:00",
	"2006-01-02T15:04:05-0700",
	"2006-01-02T15:04:05:-0700",
	"2006-01-02T15:04:05-07:00:00",
	"2006-01-02T15:04:05Z",
	"2006-01-02T15:04-07:00",
	"2006-01-02T15:04Z",
	"2006-1-02T15:04:05Z",
	"2006-1-2",
	"2006-1-2 15:04:05",
	"2006-1-2T15:04:05Z",
	"2006 January 02",
	"2-1-2006",
	"2/1/2006",
	"2.1.2006 15:04:05",
	"2 Jan 2006",
	"2 Jan 2006 15:04:05 -0700",
	"2 Jan 2006 15:04:05 MST",
	"2 Jan 2006 15:04:05 Z",
	"2 January 2006",
	"2 January 2006 15:04:05 -0700",
	"2 January 2006 15:04:05 MST",
	"6-1-2 15:04",
	"6/1/2 15:04",
	"Jan 02, 2006",
	"Jan 02 2006 03:04:05PM",
	"Jan 2, 2006",
	"Jan 2, 2006 15:04:05 MST",
	"Jan 2, 2006 3:04:05 PM",
	"Jan 2, 2006 3:04:05 PM MST",
	"January 02, 2006",
	"January 02, 2006 03:04 PM",
	"January 02, 2006 15:04",
	"January 02, 2006 15:04:05 MST",
	"January 2, 2006",
	"January 2, 2006 03:04 PM",
	"January 2, 2006 15:04:05",
	"January 2, 2006 15:04:05 MST",
	"January 2, 2006, 3:04 p.m.",
	"January 2, 2006 3:04 PM",
	"Mon, 02 Jan 06 15:04:05 MST",
	"Mon, 02 Jan 2006",
	"Mon, 02 Jan 2006 15:04:05",
	"Mon, 02 Jan 2006 15:04:05 00",
	"Mon, 02 Jan 2006 15:04:05 -07",
	"Mon 02 Jan 2006 15:04:05 -0700",
	"Mon, 02 Jan 2006 15:04:05 --0700",
	"Mon, 02 Jan 2006 15:04:05 -07:00",
	"Mon, 02 Jan 2006 15:04:05 -0700",
	"Mon,02 Jan 2006 15:04:05 -0700",
	"Mon, 02 Jan 2006 15:04:05 GMT-0700",
	"Mon , 02 Jan 2006 15:04:05 MST",
	"Mon, 02 Jan 2006 15:04:05 MST",
	"Mon, 02 Jan 2006 15:04:05MST",
	"Mon, 02 Jan 2006, 15:04:05 MST",
	"Mon, 02 Jan 2006 15:04:05 MST -0700",
	"Mon, 02 Jan 2006 15:04:05 MST-07:00",
	"Mon, 02 Jan 2006 15:04:05 UT",
	"Mon, 02 Jan 2006 15:04:05 Z",
	"Mon, 02 Jan 2006 15:04 -0700",
	"Mon, 02 Jan 2006 15:04 MST",
	"Mon,02 Jan 2006 15:04 MST",
	"Mon, 02 Jan 2006 15 -0700",
	"Mon, 02 Jan 2006 3:04:05 PM MST",
	"Mon, 02 January 2006",
	// The hour token is "15"; a "14" here would be read as a numeric month
	// followed by a literal "4" and could never match a real timestamp.
	"Mon,02 January 2006 15:04:05 MST",
	"Mon, 2006-01-02 15:04",
	"Mon, 2 Jan 06 15:04:05 -0700",
	"Mon, 2 Jan 06 15:04:05 MST",
	"Mon, 2 Jan 15:04:05 MST",
	"Mon, 2 Jan 2006",
	"Mon,2 Jan 2006",
	"Mon, 2 Jan 2006 15:04",
	"Mon, 2 Jan 2006 15:04:05",
	"Mon, 2 Jan 2006 15:04:05 -0700",
	"Mon, 2 Jan 2006 15:04:05-0700",
	"Mon, 2 Jan 2006 15:04:05 -0700 MST",
	"mon,2 Jan 2006 15:04:05 MST",
	"Mon 2 Jan 2006 15:04:05 MST",
	"Mon, 2 Jan 2006 15:04:05 MST",
	"Mon, 2 Jan 2006 15:04:05MST",
	"Mon, 2 Jan 2006 15:04:05 UT",
	"Mon, 2 Jan 2006 15:04 -0700",
	"Mon, 2 Jan 2006, 15:04 -0700",
	"Mon, 2 Jan 2006 15:04 MST",
	"Mon, 2, Jan 2006 15:4",
	"Mon, 2 Jan 2006 15:4:5 -0700 GMT",
	"Mon, 2 Jan 2006 15:4:5 MST",
	"Mon, 2 Jan 2006 3:04:05 PM -0700",
	"Mon, 2 January 2006",
	"Mon, 2 January 2006 15:04:05 -0700",
	"Mon, 2 January 2006 15:04:05 MST",
	"Mon, 2 January 2006, 15:04:05 MST",
	"Mon, 2 January 2006, 15:04 -0700",
	"Mon, 2 January 2006 15:04 MST",
	"Monday, 02 January 2006 15:04:05",
	"Monday, 02 January 2006 15:04:05 -0700",
	"Monday, 02 January 2006 15:04:05 MST",
	"Monday, 2 Jan 2006 15:04:05 -0700",
	"Monday, 2 Jan 2006 15:04:05 MST",
	"Monday, 2 January 2006 15:04:05 -0700",
	"Monday, 2 January 2006 15:04:05 MST",
	"Monday, January 02, 2006",
	"Monday, January 2, 2006",
	"Monday, January 2, 2006 03:04 PM",
	"Monday, January 2, 2006 15:04:05 MST",
	"Mon Jan 02 2006 15:04:05 -0700",
	"Mon, Jan 02,2006 15:04:05 MST",
	"Mon Jan 02, 2006 3:04 pm",
	"Mon Jan 2 15:04:05 2006 MST",
	"Mon Jan 2 15:04 2006",
	"Mon, Jan 2 2006 15:04:05 -0700",
	"Mon, Jan 2 2006 15:04:05 -700",
	"Mon, Jan 2, 2006 15:04:05 MST",
	"Mon, Jan 2 2006 15:04 MST",
	"Mon, Jan 2, 2006 15:04 MST",
	"Mon, January 02, 2006 15:04:05 MST",
	"Mon, January 02, 2006, 15:04:05 MST",
	"Mon, January 2 2006 15:04:05 -0700",
	"Updated January 2, 2006",
	time.ANSIC,
	time.RFC1123,
	time.RFC1123Z,
	time.RFC3339,
	time.RFC822,
	time.RFC822Z,
	time.RFC850,
	time.RubyDate,
	time.UnixDate,
}
// parseDate returns the time for the first string in ds that matches any
// layout in dateFormats. Candidates are trimmed of surrounding whitespace
// and skipped when empty; for each candidate the layouts are tried in
// dateFormats order.
//
// When nothing matches, it returns the zero time and an error naming all of
// the original (untrimmed) inputs.
func parseDate(ds ...string) (t time.Time, err error) {
	for _, raw := range ds {
		candidate := strings.TrimSpace(raw)
		if candidate == "" {
			continue
		}
		for _, layout := range dateFormats {
			parsed, perr := time.Parse(layout, candidate)
			if perr == nil {
				return parsed, nil
			}
		}
	}
	return time.Time{}, fmt.Errorf("could not parse dates: %v", strings.Join(ds, ", "))
}