go-feedparser

Simple RSS and Atom feed parser

git clone https://git.8pit.net/go-feedparser.git

  1// This program is free software: you can redistribute it and/or modify
  2// it under the terms of the GNU General Public License as published by
  3// the Free Software Foundation, either version 3 of the License, or
  4// (at your option) any later version.
  5//
  6// This program is distributed in the hope that it will be useful,
  7// but WITHOUT ANY WARRANTY; without even the implied warranty of
  8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  9// GNU General Public License for more details.
 10//
 11// You should have received a copy of the GNU General Public License
 12// along with this program. If not, see <http://www.gnu.org/licenses/>.
 13
 14package feedparser
 15
 16import (
 17	"encoding/xml"
 18)
 19
 20// AtomFeed represents an atom web feed.
 21type AtomFeed struct {
 22	// XMLName.
 23	XMLName xml.Name `xml:"feed"`
 24
 25	// Universally unique feed ID (required).
 26	ID string `xml:"id"`
 27
 28	// Human readable title for the feed (required).
 29	Title AtomText `xml:"title"`
 30
 31	// Last time the feed was significantly modified (required).
 32	Updated string `xml:"updated"`
 33
 34	// Entries for the feed (required).
 35	Entries []AtomEntry `xml:"entry"`
 36
 37	// Authors of the feed (recommended).
 38	Authors []AtomPerson `xml:"author"`
 39
 40	// Links which identify related web pages (recommended).
 41	Links []AtomLink `xml:"link"`
 42
 43	// Categories the feed belongs to (optional).
 44	Categories []AtomCategory `xml:"category"`
 45
 46	// Contributors to the feed (optional).
 47	Contributors []AtomPerson `xml:"contributor"`
 48
 49	// Software used to generate the feed (optional).
 50	Generator AtomGenerator `xml:"generator"`
 51
 52	// Small icon used for visual identification (optional).
 53	Icon string `xml:"icon"`
 54
 55	// Larger logo for visual identification (optional).
 56	Logo string `xml:"logo"`
 57
 58	// Information about rights, for example copyrights (optional).
 59	Rights AtomText `xml:"rights"`
 60
 61	// Human readable description or subtitle (optional).
 62	Subtitle AtomText `xml:"subtitle"`
 63}
 64
 65// AtomEntry represents an atom entry.
 66type AtomEntry struct {
 67	// Universally unique feed ID (required).
 68	ID string `xml:"id"`
 69
 70	// Human readable title for the entry (required).
 71	Title AtomText `xml:"title"`
 72
 73	// Last time the feed was significantly modified (required).
 74	Updated string `xml:"updated"`
 75
 76	// Authors of the entry (recommended).
 77	Authors []AtomPerson `xml:"author"`
 78
 79	// Content of the entry (recommended).
 80	Content AtomText `xml:"content"`
 81
 82	// Links which identify related web pages (recommended).
 83	Links []AtomLink `xml:"link"`
 84
 85	// Short summary, abstract or excerpt of the entry (recommended).
 86	Summary AtomText `xml:"summary"`
 87
 88	// Categories the entry belongs too (optional).
 89	Categories []AtomCategory `xml:"category"`
 90
 91	// Contributors to the entry (optional).
 92	Contributors []AtomPerson `xml:"contributor"`
 93
 94	// Time of the initial creation of the entry (optional).
 95	Published string `xml:"published"`
 96
 97	// FIXME
 98	// Feed's metadata, only used when entry was copied from another feed (optional).
 99	// Source AtomFeed `xml:"source"`
100
101	// Information about rights, for example copyrights (optional).
102	Rights AtomText `xml:"rights"`
103}
104
// AtomLink mirrors the attributes of an Atom <link> element.
type AtomLink struct {
	// Target of the link (required).
	Href string `xml:"href,attr"`

	// Relation type of the link, for example "alternate", "self" or
	// "enclosure" (optional).
	Rel string `xml:"rel,attr"`

	// Media type of the linked resource (optional).
	Type string `xml:"type,attr"`

	// Language of the linked resource (optional).
	HrefLang string `xml:"hreflang,attr"`

	// Human readable description of the link (optional).
	Title string `xml:"title,attr"`

	// Size of the linked resource in bytes, stored as the raw
	// attribute string (optional).
	Length string `xml:"length,attr"`
}
125
// AtomPerson describes a person, corporation or similar entity, as
// used by the <author> and <contributor> elements.
type AtomPerson struct {
	// Human readable name of the person (required).
	Name string `xml:"name"`

	// Home page of the person (optional).
	URI string `xml:"uri"`

	// E-mail address of the person (optional).
	Email string `xml:"email"`
}
137
// AtomCategory mirrors the attributes of an Atom <category> element.
type AtomCategory struct {
	// Identifier of the category (required).
	Term string `xml:"term,attr"`

	// URI naming the categorization scheme (optional).
	Scheme string `xml:"scheme,attr"`

	// Human readable label intended for display (optional).
	Label string `xml:"label,attr"`
}
149
// AtomGenerator mirrors the Atom <generator> element, which names the
// software that produced the feed.
type AtomGenerator struct {
	// Name of the generator, taken from the element's text (required).
	Name string `xml:",chardata"`

	// URI associated with the generator (optional).
	URI string `xml:"uri,attr"`

	// Version of the generator (optional).
	Version string `xml:"version,attr"`
}
161
// AtomText holds human readable text as used by elements such as
// <title>, <rights>, <subtitle>, <summary> and <content>.
type AtomText struct {
	// Character data found directly inside the element (required).
	Body string `xml:",chardata"`

	// Raw, unparsed XML nested inside the element (optional).
	InnerXML string `xml:",innerxml"`

	// Value of the "type" attribute describing how the text is
	// encoded (optional).
	Type string `xml:"type,attr"`

	// URI where the content can be found (optional for <content>).
	//
	// The Atom format carries this reference in the "src" attribute.
	// The flag must be spelled "attr": encoding/xml silently ignores
	// unknown flags, so a misspelled one maps the field to a child
	// element and the attribute is never read.
	URI string `xml:"src,attr"`
}
176
177// parseAtom parses an atom feed and returns a generic feed.
178func parseAtom(data []byte) (f Feed, err error) {
179	var origFeed AtomFeed
180	if err = unmarshal(data, &origFeed); err != nil {
181		return
182	}
183
184	f = Feed{
185		Type:        "atom",
186		Title:       origFeed.Title.Body,
187		Link:        findLink(origFeed.Links).Href,
188		Description: origFeed.Subtitle.Body,
189		Image:       origFeed.Logo,
190		Generator:   origFeed.Generator.Name,
191		Rights:      origFeed.Rights.Body,
192	}
193
194	if len(origFeed.Authors) > 0 {
195		f.Author = origFeed.Authors[0].Email
196	}
197
198	f.Updated, err = parseTime(origFeed.Updated)
199	if err != nil {
200		return
201	}
202
203	for _, category := range origFeed.Categories {
204		f.Categories = append(f.Categories, category.Term)
205	}
206
207	for _, entry := range origFeed.Entries {
208		item := Item{
209			ID:         entry.ID,
210			Title:      entry.Title.Body,
211			Link:       findLink(entry.Links).Href,
212			Content:    entry.Content.Body,
213			Attachment: findAttachment(entry.Links).Href,
214		}
215
216		if len(entry.Authors) > 0 {
217			item.Author = entry.Authors[0].Email
218		}
219
220		for _, category := range entry.Categories {
221			item.Categories = append(item.Categories, category.Term)
222		}
223
224		timeStr := entry.Updated
225		if len(entry.Published) > 0 {
226			timeStr = entry.Published
227		}
228
229		item.PubDate, err = parseTime(timeStr)
230		if err != nil {
231			return
232		}
233
234		f.Items = append(f.Items, item)
235	}
236
237	return
238}
239
240// findLink attempts to find the most relevant link.
241func findLink(links []AtomLink) AtomLink {
242	var score int
243	var match AtomLink
244
245	for _, link := range links {
246		switch {
247		case link.Rel == "alternate" && link.Type == "text/html":
248			return link
249		case score < 3 && link.Type == "text/html":
250			score = 3
251			match = link
252		case score < 2 && link.Rel == "self":
253			score = 2
254			match = link
255		case score < 1 && link.Rel == "":
256			score = 1
257			match = link
258		case &match == nil:
259			match = link
260		}
261	}
262
263	return match
264}
265
266// findAttachment attempts to find a link which represents an attachment.
267func findAttachment(links []AtomLink) AtomLink {
268	for _, link := range links {
269		if link.Rel == "enclosure" {
270			return link
271		}
272	}
273
274	return AtomLink{}
275}