1/*
2 * Copyright © 2016-2019 Sören Tempel
3 *
4 * This program is free software: you can redistribute it and/or
5 * modify it under the terms of the GNU Affero General Public
6 * License as published by the Free Software Foundation, either
7 * version 3 of the License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Affero General Public License for more details.
13 *
14 * You should have received a copy of the GNU Affero General Public
15 * License along with this program. If not, see
16 * <http://www.gnu.org/licenses/>.
17 */
18
19#include <assert.h>
20#include <ctype.h>
21#include <errno.h>
22#include <limits.h>
23#include <pthread.h>
24#include <stdio.h>
25#include <stdlib.h>
26#include <string.h>
27
28#include <sys/types.h>
29
30#include "queue.h"
31#include "scanner.h"
32#include "token.h"
33#include "turing.h"
34#include "util.h"
35
36static void lexspace(scanner *);
37static void lexcomment(scanner *);
38static void lexstate(scanner *);
39static void lexterm(scanner *);
40
41/**
42 * Reads the next character from the input string and increments current
43 * position of the scanner in the input string.
44 *
45 * @param scr Scanner from which character should be read.
46 * @returns The next character or -1 if there is none.
47 */
48static int
49nextch(scanner *scr)
50{
51 if (scr->pos >= scr->inlen)
52 return -1;
53
54 return scr->input[scr->pos++];
55}
56
57/**
58 * Reads the next character from the input string but doesn't increment
59 * the current position of the scanner.
60 *
61 * @param scr Scanner from which character should be read.
62 * @returns The next character or -1 if there is none.
63 */
64static int
65peekch(scanner *scr)
66{
67 int nxt;
68
69 nxt = nextch(scr);
70 if (nxt != -1)
71 scr->pos--;
72 return nxt;
73}
74
75/**
76 * Ignores all characters in the input string after the current start position.
77 *
78 * @param scr Scanner for which characters should be ignored.
79 */
80static void
81ignore(scanner *scr)
82{
83 scr->start = scr->pos;
84}
85
86/**
87 * Whether or not the current character is a valid turing machine input
88 * symbol. Which is the case if it is either an alphanumeric character
89 * or the special blank character.
90 *
91 * @param c Character which should be examined.
92 * @returns Non-zero integer if it is, zero if it isn't.
93 */
94static int
95issymbol(int c)
96{
97 return isalnum(c) || c == BLANKCHAR;
98}
99
100/**
101 * Emits a new token and enqueues it.
102 *
103 * @param scr Scanner which found the token that should be emitted.
104 * @param tkt Type of the token.
105 * @param value Value of the token.
106 */
107static void
108emit(scanner *scr, toktype tkt, int value)
109{
110 token *tok;
111
112 tok = emalloc(sizeof(token));
113 tok->type = tkt;
114 tok->line = scr->line;
115 tok->column = scr->column;
116 tok->value = value;
117
118 enqueue(scr->tqueue, tok);
119 scr->start = scr->pos;
120}
121
122/**
123 * Tail recursive function parsing the entire input string.
124 * Usually passed to pthread_create(3).
125 *
126 * @param scr Pointer to the associated scanner.
127 */
128static void
129lexany(scanner *scr)
130{
131 int nxt;
132
133 /* Make this function a cancel point. */
134 pthread_testcancel();
135
136 scr->column++;
137 if ((nxt = nextch(scr)) == -1) {
138 scr->column = 0;
139 if (scr->line > 1)
140 scr->line--;
141
142 emit(scr, TOK_EOF, TOKNOP);
143 LEXRET(scr, NULL);
144 }
145
146 switch (nxt) {
147 case '\n':
148 scr->column = 0;
149 scr->line++;
150 ignore(scr);
151 LEXRET(scr, lexany);
152 case '#':
153 LEXRET(scr, lexcomment);
154 case ',':
155 emit(scr, TOK_COMMA, TOKAUTO(scr));
156 LEXRET(scr, lexany);
157 case ';':
158 emit(scr, TOK_SEMICOLON, TOKAUTO(scr));
159 LEXRET(scr, lexany);
160 case '{':
161 emit(scr, TOK_LBRACKET, TOKAUTO(scr));
162 LEXRET(scr, lexany);
163 case '}':
164 emit(scr, TOK_RBRACKET, TOKAUTO(scr));
165 LEXRET(scr, lexany);
166 case '<':
167 emit(scr, TOK_SMALLER, TOKAUTO(scr));
168 LEXRET(scr, lexany);
169 case '>':
170 emit(scr, TOK_GREATER, TOKAUTO(scr));
171 LEXRET(scr, lexany);
172 case '|':
173 emit(scr, TOK_PIPE, TOKAUTO(scr));
174 LEXRET(scr, lexany);
175 case 'q':
176 if (isdigit(peekch(scr)))
177 LEXRET(scr, lexstate);
178 break;
179 case '=':
180 if (nextch(scr) == '>')
181 emit(scr, TOK_NEXT, TOKNOP);
182 else
183 emit(scr, TOK_ERROR, ERR_UNKOWN);
184
185 scr->column++;
186 LEXRET(scr, lexany);
187 }
188
189 if (issymbol(nxt) && !issymbol(peekch(scr))) {
190 emit(scr, TOK_SYMBOL, TOKAUTO(scr));
191 LEXRET(scr, lexany);
192 } else if (isspace(nxt)) {
193 LEXRET(scr, lexspace);
194 }
195
196 LEXRET(scr, lexterm);
197}
198
199/**
200 * Skips/Ignores all white-space characters. Also
201 * calls lexany as soon as it encounters the first
202 * non-white-space character.
203 *
204 * @param scr Pointer to the associated scanner.
205 */
206static void
207lexspace(scanner *scr)
208{
209 while (isspace(peekch(scr))) {
210 nextch(scr);
211 scr->column++;
212 }
213
214 ignore(scr);
215 LEXRET(scr, lexany);
216}
217
218/**
219 * Skips/Ignores a comment. A comment begins with the
220 * ASCII symbol '#' and ends with a newline.
221 *
222 * @param scr Pointer to the associated scanner.
223 */
224static void
225lexcomment(scanner *scr)
226{
227 while (peekch(scr) != '\n')
228 if (nextch(scr) == -1)
229 break;
230
231 /* We don't need to increment scr->column here because
232 * comments are currently not exposed to the parser
233 * and since they end with a newline they don't effect
234 * other tokens either (unlike white-spaces). */
235
236 ignore(scr);
237 LEXRET(scr, lexany);
238}
239
240/**
241 * Lexes a state name. Also calls lexany as soon as
242 * it encounters a non-digit character. Besides an
243 * error token is emitted if converting the token
244 * name to an int would cause an integer over/underflow.
245 *
246 * @pre The previous char must be the ASCII character 'q'.
247 * @param scr Pointer to the associated scanner.
248 */
249static void
250lexstate(scanner *scr)
251{
252 char *input, buf[STATELEN];
253 unsigned int col;
254 size_t len;
255 long value;
256
257 input = &scr->input[scr->pos];
258
259 col = scr->column;
260 while (isdigit(peekch(scr)))
261 nextch(scr);
262 scr->column = col;
263
264 len = scr->pos - scr->start - 1;
265 if (len > STATELEN - 1) {
266 emit(scr, TOK_ERROR, ERR_OVERFLOW);
267 goto ret;
268 }
269
270 memcpy(buf, input, len);
271 buf[len] = '\0';
272
273 value = strtol(buf, NULL, 10);
274 if (value < 0)
275 emit(scr, TOK_ERROR, ERR_UNDERFLOW);
276 else if (value > INT_MAX)
277 emit(scr, TOK_ERROR, ERR_OVERFLOW);
278 else
279 emit(scr, TOK_STATE, (int)value);
280
281ret:
282 scr->column += (unsigned int)len; /* Initial 'q' and digits. */
283 LEXRET(scr, lexany);
284}
285
286/**
287 * Attempts to lex the terminal symbols `start:` and `accept:`. It calls
288 * ::lexany as soon as it reaches the end of the given terminal symbol
289 * or if a character is encountered which is not part of the any known
290 * terminal symbol.
291 *
292 * @param scr Pointer to the associated scanner.
293 */
294static void
295lexterm(scanner *scr)
296{
297 char *ter;
298 toktype tkt;
299 size_t pos, len;
300
301 switch (scr->input[scr->start]) {
302 case 's':
303 tkt = TOK_START;
304 ter = "start:";
305 break;
306 case 'a':
307 tkt = TOK_ACCEPT;
308 ter = "accept:";
309 break;
310 default:
311 emit(scr, TOK_ERROR, ERR_UNKOWN);
312 LEXRET(scr, lexany);
313 }
314
315 len = strlen(ter);
316 if (!xstrncmp(ter, &scr->input[scr->start], len, &pos)) {
317 emit(scr, tkt, TOKNOP);
318 } else {
319 /* pos should always be < strlen({start:,accept:}) */
320 assert(pos <= UINT_MAX - 1);
321
322 scr->column += (unsigned int)pos;
323 emit(scr, TOK_ERROR, ERR_UNEXPECTED);
324 }
325
326 scr->pos += --len;
327 scr->start = scr->pos;
328
329 scr->column += (unsigned int)len;
330 LEXRET(scr, lexany);
331}
332
333/**
334 * Function invoked by a seperate LWP for parsing the input string. This
335 * function simply calls the current ::statefn until the called
336 * ::statefn function sets the current state to `NULL`.
337 *
338 * @param pscr Void pointer to the scanner used for lexing.
339 * @returns A null pointer.
340 */
341static void *
342tokloop(void *pscr)
343{
344 scanner *scr;
345
346 scr = (scanner *)pscr;
347 while (scr->state != NULL)
348 (*scr->state)(scr); /* fn must set scr->state. */
349
350 return NULL;
351}
352
353/**
354 * Creates a new scanner for the given input and starts lexing the
355 * it in a seperated thread.
356 *
357 * @param input Input which should be scanned.
358 * @param len Length of the input.
359 * @returns Scanner for the given input.
360 */
361scanner *
362scanstr(char *input, size_t len)
363{
364 scanner *scr;
365
366 scr = emalloc(sizeof(scanner));
367 scr->tqueue = newqueue();
368 scr->state = lexany;
369 scr->pos = scr->start = scr->column = 0;
370 scr->inlen = len;
371 scr->input = input;
372 scr->line = 1;
373
374 if ((errno = pthread_create(&scr->thread, NULL, tokloop, (void *)scr)))
375 die("pthread_create failed");
376
377 return scr;
378}
379
380/**
381 * Frees the allocated memory for the given scanner.
382 *
383 * @param scr Scanner for which allocated memory should be freed.
384 */
385void
386freescanner(scanner *scr)
387{
388 assert(scr);
389
390 if (!pthread_cancel(scr->thread)) {
391 if ((errno = pthread_join(scr->thread, NULL)))
392 die("pthread_join failed");
393 }
394
395 freequeue(scr->tqueue);
396 free(scr);
397}
398
399/**
400 * Returns the least recent token scanned by the given scanner. If no
401 * token has been scanned so for this function blocks until a token has
402 * been emitted by the scanner. If the last token (TOK_EOF) was already
403 * returned by this function then it causes a deadlock.
404 *
405 * @pre A previous call of this function didn't return TOK_EOF.
406 * @param scr Scanner to extract token from.
407 */
408token *
409nexttoken(scanner *scr)
410{
411 return dequeue(scr->tqueue);
412}