go-feedparser

Simple RSS and ATOM feed parser

git clone https://git.8pit.net/go-feedparser.git

  1// This program is free software: you can redistribute it and/or modify
  2// it under the terms of the GNU General Public License as published by
  3// the Free Software Foundation, either version 3 of the License, or
  4// (at your option) any later version.
  5//
  6// This program is distributed in the hope that it will be useful,
  7// but WITHOUT ANY WARRANTY; without even the implied warranty of
  8// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  9// GNU General Public License for more details.
 10//
 11// You should have received a copy of the GNU General Public License
 12// along with this program. If not, see <http://www.gnu.org/licenses/>.
 13//
 14// This is a slightly modified version of 'encoding/xml/read_test.go'.
 15// Copyright 2009 The Go Authors. All rights reserved. Use of this
 16// source code is governed by a BSD-style license that can be found in
 17// the LICENSE file.
 18
 19package feedparser
 20
 21import (
 22	"encoding/xml"
 23)
 24
 25// RssFeed represents an rss web feed.
 26type RssFeed struct {
 27	// XMLName.
 28	XMLName xml.Name `xml:"rss"`
 29
 30	// Name of the channel (required).
 31	Title string `xml:"channel>title"`
 32
 33	// URL to the website (required).
 34	Link string `xml:"channel>link"`
 35
 36	// Description for the channel (required).
 37	Description string `xml:"channel>description"`
 38
 39	// Items for the feed (required).
 40	Items []RssItem `xml:"channel>item"`
 41
 42	// Language the channel is written in (optional).
 43	Language string `xml:"channel>language"`
 44
 45	// Copyright notice for the content (optional).
 46	Copyright string `xml:"channel>copyright"`
 47
 48	// Email address of the editor (optional).
 49	Editor string `xml:"channel>managingEditor"`
 50
 51	// Email address of the web master (optional).
 52	WebMaster string `xml:"channel>webMaster"`
 53
 54	// Publication date for the content (optional).
 55	PubDate string `xml:"channel>pubDate"`
 56
 57	// Last time the content was updated (optional).
 58	LastBuildDate string `xml:"channel>lastBuildDate"`
 59
 60	// Categories the feed belongs to (optional).
 61	Categories []RssCategory `xml:"channel>category"`
 62
 63	// Program used to generate the channel (optional).
 64	Generator string `xml:"channel>generator"`
 65
 66	// URL that points to documentation for the used format (optional).
 67	Docs string `xml:"channel>docs"`
 68
 69	// Cloud for update notifications (optional).
 70	Cloud RssCloud `xml:"channel>cloud"`
 71
 72	// How long the channel can be cached (optional).
 73	TTL int `xml:"channel>ttl"`
 74
 75	// Image that can be displayed with the channel (optional).
 76	Image RssImage `xml:"channel>image"`
 77
 78	// PICS rating for the channel (optional).
 79	Rating string `xml:"channel>rating"`
 80
 81	// Text input box related to the channel (optional).
 82	TextInput RssTextInput `xml:"channel>textInput"`
 83
 84	// Hint for aggregators telling them which hours can be skipped (optional).
 85	SkipHours []RssHour `xml:"channel>skipHours"`
 86
 87	// Hint for aggregators telling them which days can be skipped (optional).
 88	SkipDays []RssDay `xml:"channel>skipDays"`
 89}
 90
 91// RssItem represents an rss item.
 92type RssItem struct {
 93	// Title of the item (required if description isn't present).
 94	Title string `xml:"title"`
 95
 96	// The item synopsis (required if title isn't present).
 97	Description string `xml:"description"`
 98
 99	// The URL of the item (optional).
100	Link string `xml:"link"`
101
102	// Email address of the author of the item (optional).
103	Author string `xml:"author"`
104
105	// Includes item in one or more categories (optional).
106	Categories []RssCategory `xml:"category"`
107
108	// URL to a page for comments (optional).
109	Comments string `xml:"comments"`
110
111	// Media object that is attached to the item (optional).
112	Enclosure RssEnclosure `xml:"enclosure"`
113
114	// String that uniquely identifies the item (optional).
115	GUID string `xml:"guid"`
116
117	// Time the item was published (optional).
118	PubDate string `xml:"pubDate"`
119
120	// The RSS channel the item came from (optional).
121	Source RssSource `xml:"source"`
122}
123
// RssEnclosure mirrors the <enclosure> element of an item. All of its
// data is carried in XML attributes.
type RssEnclosure struct {
	// URL is the location of the enclosure (required).
	URL string `xml:"url,attr"`

	// Length is the size of the enclosure in bytes, stored as the raw
	// attribute text rather than a number (required).
	Length string `xml:"length,attr"`

	// Type is the MIME type of the enclosure (required).
	Type string `xml:"type,attr"`
}
135
// RssImage mirrors the <image> element of a channel. All fields are
// read from child elements.
type RssImage struct {
	// URL points to the image that represents the channel (required).
	URL string `xml:"url"`

	// Title is the text describing the image (required).
	Title string `xml:"title"`

	// Link is the URL of the site itself (required).
	Link string `xml:"link"`

	// Width is the width of the image (optional).
	Width int `xml:"width"`

	// Height is the height of the image (optional).
	Height int `xml:"height"`

	// Description is additional text about the image (optional).
	Description string `xml:"description"`
}
156
// RssCloud mirrors the <cloud> element of a channel. All of its data is
// carried in XML attributes.
type RssCloud struct {
	// Domain is the domain the cloud service runs on (required).
	Domain string `xml:"domain,attr"`

	// Port is the port used for the TCP socket connection (required).
	Port int `xml:"port,attr"`

	// Path is the path used for the request (required).
	Path string `xml:"path,attr"`

	// RegisterProcedure names the register procedure to use (required).
	RegisterProcedure string `xml:"registerProcedure,attr"`

	// Protocol is the protocol used for registration and related calls
	// (required).
	Protocol string `xml:"protocol,attr"`
}
174
// RssCategory mirrors a <category> element, as used by both channels
// and items.
type RssCategory struct {
	// Name is the human readable category name, taken from the
	// element's character data (required).
	Name string `xml:",chardata"`

	// Domain identifies the categorization taxonomy, taken from the
	// "domain" attribute (optional).
	Domain string `xml:"domain,attr"`
}
183
// RssTextInput mirrors the <textInput> element of a channel. All fields
// are read from child elements.
type RssTextInput struct {
	// Title is the label of the Submit button in the text input area
	// (required).
	Title string `xml:"title"`

	// Description explains the text input area (required).
	Description string `xml:"description"`

	// Name is the name of the text object in the text input area
	// (required).
	Name string `xml:"name"`

	// Link is the URL of the CGI script that processes text input
	// requests (required).
	Link string `xml:"link"`
}
198
// RssSource mirrors the <source> element of an item.
type RssSource struct {
	// URL links to the XMLization of the source, taken from the "url"
	// attribute (required).
	URL string `xml:"url,attr"`

	// Name is the source name, taken from the element's character data
	// (required).
	Name string `xml:",chardata"`
}
207
// RssHour carries the value of an <hour> tag, a subelement of the
// skipHours tag.
//
// NOTE(review): Hour is a single int filled from a child element named
// "hour". When this type is decoded from a skipHours element holding
// several <hour> children, presumably only the last one is kept;
// confirm against encoding/xml behaviour and the RssFeed.SkipHours tag.
type RssHour struct {
	// Hour is a number between 0 and 23 representing a time in GMT
	// (required).
	Hour int `xml:"hour"`
}
213
// RssDay carries the value of a <day> tag, a subelement of the skipDays
// tag.
//
// NOTE(review): Day is a single string filled from a child element
// named "day". When this type is decoded from a skipDays element
// holding several <day> children, presumably only the last one is
// kept; confirm against encoding/xml behaviour and the
// RssFeed.SkipDays tag.
type RssDay struct {
	// Day is the name of a weekday, e.g. Monday (required).
	Day string `xml:"day"`
}
219
220// parseRss parses an rss feed and returns a generic feed.
221func parseRss(data []byte) (f Feed, err error) {
222	var origFeed RssFeed
223	if err = unmarshal(data, &origFeed); err != nil {
224		return
225	}
226
227	f = Feed{
228		Type:        "rss",
229		Title:       origFeed.Title,
230		Link:        origFeed.Link,
231		Description: origFeed.Description,
232		Image:       origFeed.Image.URL,
233		Generator:   origFeed.Generator,
234		Rights:      origFeed.Copyright,
235		Author:      origFeed.Editor,
236	}
237
238	if len(origFeed.LastBuildDate) > 0 {
239		f.Updated, err = parseTime(origFeed.LastBuildDate)
240		if err != nil {
241			return
242		}
243	}
244
245	for _, category := range origFeed.Categories {
246		f.Categories = append(f.Categories, category.Name)
247	}
248
249	for _, entry := range origFeed.Items {
250		item := Item{
251			ID:         entry.GUID,
252			Title:      entry.Title,
253			Link:       entry.Link,
254			Content:    entry.Description,
255			Attachment: entry.Enclosure.URL,
256			Author:     entry.Author,
257		}
258
259		for _, category := range entry.Categories {
260			item.Categories = append(item.Categories, category.Name)
261		}
262
263		item.PubDate, err = parseTime(entry.PubDate)
264		if err != nil {
265			return
266		}
267
268		f.Items = append(f.Items, item)
269	}
270
271	return
272}