1 module json.parser.lexer;
2 
private {
    import json.value : JsonException;

    import std.algorithm;
    import std.conv;
    import std.functional;
    import std.range;
    import std.string;
    import std.traits;
    import std.uni;
    import std.utf;

    // Signature shared by the Lexer's tryLex* methods: receives the next
    // unread character and, when it returns true, has stored the lexed token
    // in the `out` parameter.
    alias Tokenizer = bool delegate( dchar, out JsonToken );

    // Per-character test handed to Lexer.takeWhile / Lexer.skipWhile; input
    // keeps being consumed for as long as it returns true.
    alias Predicate = bool delegate( dchar );
}
18 
package
{
    /// Maps each reserved word to the token type the lexer emits for it.
    /// Any other word is lexed as an identifier (or rejected in
    /// standard-compliant mode).
    enum JsonToken.Type[dstring] Keywords = [
        "false": JsonToken.Type.False,
        "true":  JsonToken.Type.True,
        "null":  JsonToken.Type.Null,
    ];

    /// Maps each single-character structural symbol to its token type.
    enum JsonToken.Type[dchar] Punctuation = [
        '[': JsonToken.Type.LeftSquare,
        ']': JsonToken.Type.RightSquare,
        '{': JsonToken.Type.LeftBrace,
        '}': JsonToken.Type.RightBrace,
        ':': JsonToken.Type.Colon,
        ',': JsonToken.Type.Comma,
    ];
}
35 
/**
 * Exception raised for malformed input; in addition to the message it carries
 * the `TextSpan` of the offending source text.
 */
final class JsonParserException : JsonException
{
    private TextSpan _span;

    /// The region of the source text this error refers to.
    @property TextSpan span() const pure nothrow @safe
    {
        return _span;
    }

package:
    /**
     * Params:
     *   span = location of the offending text
     *   msg  = human-readable description of the problem
     *   file = source file of the throw site (defaults to the caller's)
     *   line = source line of the throw site (defaults to the caller's)
     *   next = optional chained throwable
     */
    this( TextSpan span, string msg, string file = __FILE__, size_t line = __LINE__, Throwable next = null )
    {
        _span = span;
        super( msg, file, line, next );
    }

    /// Same as the span-based constructor, taking the location from `token.span`.
    this( JsonToken token, string msg, string file = __FILE__, size_t line = __LINE__, Throwable next = null )
    {
        this( token.span, msg, file, line, next );
    }
}
57 
/**
 * Immutable description of a contiguous region of the source text.
 *
 * `line`, `column` and `index` locate the first character of the region;
 * `length` is the number of characters it covers.
 */
package struct TextSpan
{
    immutable size_t line;
    immutable size_t column;
    immutable size_t index;
    immutable size_t length;

    // A span without a position is meaningless, so default construction is disabled.
    @disable this();

    /**
     * Params:
     *   line   = line of the first character
     *   column = column of the first character
     *   index  = offset of the first character in the source
     *   length = number of characters covered (defaults to 0)
     */
    this( size_t line, size_t column, size_t index, size_t length = size_t.init ) pure nothrow @safe
    {
        this.line = line;
        this.column = column;
        this.index = index;
        this.length = length;
    }

    /**
     * Returns: a copy of this span with the same start position and
     * `newLength` as its length. `const` so it can be called on `const`
     * and `immutable` spans as well.
     */
    TextSpan withLength( size_t newLength ) const pure nothrow @safe
    {
        return TextSpan( this.line, this.column, this.index, newLength );
    }
}
79 
/**
 * A single lexical token: its type, its text and where it was found.
 */
final package class JsonToken
{
    /// Kind of token.
    enum Type
    {
        Identifier,

        String,
        Number,

        True,
        False,
        Null,

        LeftSquare,
        RightSquare,
        LeftBrace,
        RightBrace,

        Comma,
        Colon,

        EndOfInput,
    }

    /// Numeric flavour of a `Type.Number` token.
    enum NumberType
    {
        signed,
        unsigned,
        floating,
    }

    private dstring _text;
    private TextSpan _span;

    immutable Type type;

    // Only meaningful when `type == Type.Number`. It is default-initialised
    // (NumberType.signed) for every other token rather than left `= void`,
    // so reading it never observes indeterminate memory.
    immutable NumberType numberType;

    /// The token's text as stored by the constructor.
    dstring text() const @property
    {
        return this._text;
    }

    /// Location of the token in the source text.
    TextSpan span() const @property
    {
        return this._span;
    }

    /// Creates a token of the given `type`.
    this( Type type, dstring text, TextSpan span )
    {
        this.type = type;
        this._text = text;
        this._span = span;
    }

    /// Creates a `Type.Number` token with the given numeric flavour.
    this( NumberType numType, dstring text, TextSpan span )
    {
        // All fields are initialised here instead of delegating to the other
        // constructor, so no immutable field is written before a delegating
        // constructor call.
        this.type = Type.Number;
        this.numberType = numType;
        this._text = text;
        this._span = span;
    }

    /**
     * Returns: a short description of the token: the lower-cased type name
     * for numbers, strings, `true`, `false` and `null`; "end-of-input" for
     * the end marker; the token's own text (as UTF-8) for everything else.
     */
    string identify() const
    {
        import std.utf;
        import std.conv;
        import std.string;

        with( Type )
        switch( this.type )
        {
            case Number:
            case String:
            case True:
            case False:
            case Null:
                return to!( string )( this.type ).toLower();

            case EndOfInput:
                return "end-of-input";

            default:
                return this.text.toUTF8();
        }
    }
}
164 
/// Flag passed to the Lexer constructor. When it is not set, the lexer also
/// skips `//` and `/* */` comments, accepts single-quoted strings and emits
/// bare words as identifier tokens.
alias StandardCompliant = Flag!"JsonStandardCompliantParsing";
166 
/**
 * Splits JSON source text into an array of `JsonToken`s.
 *
 * The input is decoded to UTF-32 once, then consumed one code point at a time
 * while the current line, column and index are tracked for error reporting.
 * When `StandardCompliant` is not set, the lexer additionally skips `//` and
 * nestable `/* ... */` comments, accepts single-quoted strings and emits bare
 * words as `Identifier` tokens.
 */
final package class Lexer
{
private:
    dstring source;                       // input decoded to UTF-32
    immutable size_t length;              // number of code points in `source`
    size_t index;                         // offset of the next unread code point
    size_t line;                          // zero-based line of `index`
    size_t column;                        // zero-based column of `index`
    TextSpan[] spans;                     // stack of span starts (markStart pushes, markEnd pops)
    Tokenizer[] tokenizers;               // token recognisers, tried in order
    immutable StandardCompliant standard; // strict-JSON mode switch

    /// True once every code point of the input has been consumed.
    bool eof() const pure nothrow @safe @property
    {
        return this.index >= this.length;
    }

    /**
     * Params:
     *   source            = text to tokenize, in any string encoding
     *   standardCompliant = whether to reject the non-standard extensions
     */
    public this( S )( S source, StandardCompliant standardCompliant ) if( isSomeString!S )
    {
        this.source = source.toUTF32();
        this.length = this.source.length;
        this.standard = standardCompliant;
        this.tokenizers = [
            &this.tryLexString,
            &this.tryLexNumber,
            &this.tryLexWord,
            &this.tryLexPunctuation,
        ];
    }

    /**
     * Lexes the remaining input.
     *
     * Returns: every token found, always terminated by an `EndOfInput` token.
     * Throws: JsonParserException on a character no tokenizer recognises, or
     *         on any error raised by an individual tokenizer.
     */
    public JsonToken[] tokenize()
    {
        JsonToken[] tokens;
        while( !this.eof )
        {
            this.skipWhile( c => c.isWhite );

            if( !this.standard && ( this.trySkipSingleLineComment() || this.trySkipMultiLineComment() ) )
                continue;

            if( this.eof )
                break;

            immutable current = this.peek();
            JsonToken token;
            bool recognised;

            // First tokenizer that claims the character wins.
            foreach( tokenizer; this.tokenizers )
            {
                if( tokenizer( current, token ) )
                {
                    recognised = true;
                    break;
                }
            }

            if( !recognised )
            {
                this.markStart();
                auto c = this.take();
                throw new JsonParserException( this.markEnd(), "Unexpected character '%s' (0x%04X)".format( c, c ) );
            }

            tokens ~= token;
        }

        this.markStart();
        tokens ~= this.makeToken( JsonToken.Type.EndOfInput, null );
        return tokens;
    }

    /// Skips a `//` comment up to (not including) the next line feed.
    /// Returns: true if a comment was skipped.
    bool trySkipSingleLineComment()
    {
        if( !this.isNext( "//" ) )
            return false;

        this.skipWhile( c => c != '\n' );
        return true;
    }

    /**
     * Skips a `/* ... */` comment; comments may nest.
     *
     * Returns: true if a comment was skipped.
     * Throws: JsonParserException if the input ends inside a comment.
     */
    bool trySkipMultiLineComment()
    {
        if( !this.isNext( "/*" ) )
            return false;

        this.markStart();
        this.takeIfNext( "/*" );

        int depth = 1;
        while( depth > 0 && !this.eof )
        {
            if( this.takeIfNext( "*/" ) )
            {
                --depth;
                this.markEnd();
            }
            else if( this.isNext( "/*" ) )
            {
                ++depth;
                // record the nested opener's position before consuming it
                this.markStart();
                this.takeIfNext( "/*" );
            }
            else
                this.take();
        }

        // still inside a comment: we ran out of input before it was closed
        if( depth > 0 )
            throw new JsonParserException(
                this.markEnd(),
                "unexpected end-of-input (unclosed multi-line comment)"
            );

        return true;
    }

    /**
     * Lexes a quoted string starting at `c`, decoding escape sequences.
     *
     * Returns: false if `c` does not open a string (single quotes only open
     *          a string outside standard-compliant mode).
     * Throws: JsonParserException on an unterminated string, an invalid
     *         escape sequence or (in standard mode) a raw control character.
     */
    bool tryLexString( dchar c, out JsonToken token )
    {
        if( c != '"' && c != '\'' )
            return false;

        // Single-quoted strings are an extension; in standard mode the quote
        // is left for tokenize() to report as an unexpected character.
        if( c == '\'' && this.standard )
            return false;

        this.markStart();
        immutable terminator = this.take();

        dstring text;
        while( !this.eof )
        {
            immutable next = this.peek();
            if( next == terminator )
                break;

            if( this.standard && next.isControl && !next.isSpace )
                throw new JsonParserException(
                    this.markEnd(),
                    "Control characters (0x%04X) are not allowed in strings".format( next )
                );

            if( next == '\\' )
            {
                // Consume the backslash itself; the character after it is
                // what selects the escape sequence.
                this.take();
                if( this.eof )
                    throw new JsonParserException(
                        this.markEnd(),
                        "Unexpected end-of-input following escape sequence in string"
                    );

                text ~= this.handleEscapeSequence( this.take() );
                continue;
            }

            text ~= this.take();
        }

        if( this.eof )
            throw new JsonParserException( this.markEnd(), "Unexpected end-of-input (unterminated string)" );

        // Consume the closing quote outside the assert so it is still
        // consumed when asserts are compiled out.
        immutable closing = this.take();
        assert( closing == terminator );

        token = this.makeToken( JsonToken.Type.String, text );
        return true;
    }

    /**
     * Lexes a number starting at `c`: optional leading '-', ASCII digits, an
     * optional fraction and an optional exponent.
     *
     * Returns: false if `c` is neither an ASCII digit nor '-'.
     * Throws: JsonParserException on a duplicate decimal point or exponent,
     *         or a decimal point inside an exponent.
     */
    bool tryLexNumber( dchar c, out JsonToken token )
    {
        // JSON numbers use ASCII digits only; std.uni.isNumber would also
        // accept characters such as fractions or non-Latin digits.
        import std.ascii : isDigit;

        if( !isDigit( c ) && c != '-' )
            return false;

        bool hasDecimal;
        bool hasExponent;
        bool forceTake = c == '-';   // take the next character unconditionally
        immutable signed = c == '-';

        bool pred( dchar ch )
        {
            if( forceTake )
            {
                forceTake = false;
                return true;
            }

            if( ch == '.' )
            {
                if( hasDecimal )
                    throw new JsonParserException( this.markEnd(), "Duplicate decimal point in number" );

                if( hasExponent )
                    throw new JsonParserException( this.markEnd(), "Decimal point is not allowed in an exponent" );

                // Only part of the number when a digit follows; otherwise
                // leave it unconsumed and do not mark the number as floating.
                if( !isDigit( this.peek( 1 ) ) )
                    return false;

                hasDecimal = true;
                return true;
            }

            if( ch == 'e' || ch == 'E' )
            {
                if( hasExponent )
                    throw new JsonParserException( this.markEnd(), "Duplicate exponent in number" );

                immutable next = this.peek( 1 );
                immutable hasSign = next == '+' || next == '-';

                // The exponent marker (and its sign) only belong to the
                // number when at least one digit follows.
                if( !isDigit( this.peek( hasSign ? 2 : 1 ) ) )
                    return false;

                hasExponent = true;
                forceTake = hasSign;
                return true;
            }

            return isDigit( ch );
        }

        // NOTE(review): a '-' that is not followed by a digit still yields a
        // Number token whose text is just "-"; confirm the parser rejects it.
        this.markStart();
        auto text = this.takeWhile( &pred );
        auto type = hasDecimal || hasExponent
                  ? JsonToken.NumberType.floating
                  : ( signed ? JsonToken.NumberType.signed
                             : JsonToken.NumberType.unsigned );
        token = this.makeToken( type, text );
        return true;
    }

    /**
     * Lexes a word (letters, numbers and underscores). Keywords become their
     * dedicated token types; any other word becomes an `Identifier`.
     *
     * Returns: false if `c` cannot start a word.
     * Throws: JsonParserException for a non-keyword word in standard mode.
     */
    bool tryLexWord( dchar c, out JsonToken token )
    {
        if( !c.isAlpha && c != '_' )
            return false;

        this.markStart();
        auto text = this.takeWhile( ch => ch.isAlpha || ch.isNumber || ch == '_' );
        if( auto type = text in Keywords )
        {
            token = this.makeToken( *type, text );
            return true;
        }

        if( !this.standard )
        {
            token = this.makeToken( JsonToken.Type.Identifier, text );
            return true;
        }

        throw new JsonParserException(
            this.markEnd(),
            "Unexpected '%s' (did you forget to quote an object key?)".format( text )
        );
    }

    /// Lexes one of the structural characters listed in `Punctuation`.
    /// Returns: false if `c` is not one of them.
    bool tryLexPunctuation( dchar c, out JsonToken token )
    {
        if( auto type = c in Punctuation )
        {
            this.markStart();
            token = this.makeToken( *type, [ this.take() ] );
            return true;
        }

        return false;
    }

    /**
     * Decodes the escape sequence selected by `escape`, the character that
     * followed the backslash and has already been consumed. For `\u` the
     * hexadecimal digits (and the second half of a surrogate pair, if any)
     * are consumed here.
     *
     * Returns: the decoded character.
     * Throws: JsonParserException on an unknown or malformed sequence.
     */
    dchar handleEscapeSequence( dchar escape )
    {
        switch( escape )
        {
            case '"':
            case '\\':
            case '/':
                return escape;

            case '\'':
                // only meaningful where single-quoted strings are allowed
                if( !this.standard )
                    return escape;
                goto default;

            case 'b': return '\b';
            case 'f': return '\f';
            case 'n': return '\n';
            case 'r': return '\r';
            case 't': return '\t';

            case 'u':
            {
                immutable first = this.takeHexQuad();
                if( first < 0xD800 || first > 0xDFFF )
                    return cast( dchar ) first;

                // Characters outside the BMP are written as a UTF-16
                // surrogate pair: a high surrogate followed by \uXXXX
                // holding the low surrogate.
                if( first <= 0xDBFF && this.isNext( "\\u" ) )
                {
                    this.takeIfNext( "\\u" );
                    immutable second = this.takeHexQuad();
                    if( second >= 0xDC00 && second <= 0xDFFF )
                        return cast( dchar )( 0x10000 + ( ( first - 0xD800 ) << 10 ) + ( second - 0xDC00 ) );
                }

                throw new JsonParserException( this.markEnd(), "Invalid UTF-16 surrogate in \\u escape sequence" );
            }

            default:
                throw new JsonParserException( this.markEnd(), "Unrecognized escape sequence '\\%s'".format( escape ) );
        }
    }

    /// Consumes exactly four hexadecimal digits and returns their value.
    /// Throws: JsonParserException on end of input or a non-hex character.
    uint takeHexQuad()
    {
        import std.ascii : isHexDigit;

        uint value;
        foreach( _; 0 .. 4 )
        {
            if( this.eof )
                throw new JsonParserException( this.markEnd(), "Unexpected end-of-input following escape sequence in string" );

            immutable digit = this.take();
            if( !isHexDigit( digit ) )
                throw new JsonParserException(
                    this.markEnd(),
                    "Invalid hexadecimal digit '%s' in \\u escape sequence".format( digit )
                );

            // `| 0x20` folds 'A'..'F' onto 'a'..'f'
            value = value * 16 + ( digit <= '9' ? digit - '0' : ( digit | 0x20 ) - 'a' + 10 );
        }

        return value;
    }

    /// Pushes the current position as the start of a new span.
    void markStart()
    {
        this.spans ~= TextSpan( this.line, this.column, this.index );
    }

    /// Pops the most recently started span and returns it with its length set
    /// to the number of code points consumed since `markStart`.
    TextSpan markEnd()
    {
        auto span = this.spans.back;
        this.spans.popBack();

        return span.withLength( this.index - span.index );
    }

    /// Closes the current span and wraps it in a token of the given type.
    JsonToken makeToken( JsonToken.Type type, dstring text )
    {
        return new JsonToken( type, text, this.markEnd() );
    }

    /// Closes the current span and wraps it in a number token.
    JsonToken makeToken( JsonToken.NumberType numType, dstring text )
    {
        return new JsonToken( numType, text, this.markEnd() );
    }

    /**
     * Returns: the code point `distance` positions away from the current one
     * without consuming anything, or `dchar.init` if that position lies
     * outside the input.
     */
    dchar peek( int distance = 0 )
    {
        // `index` is unsigned, so a negative offset must be range-checked
        // before the addition rather than by testing the sum against zero.
        if( distance < 0 && this.index < cast( size_t )( -cast( long ) distance ) )
            return dchar.init;

        immutable target = this.index + distance;
        if( target >= this.length )
            return dchar.init;

        return this.source[target];
    }

    /**
     * Consumes and returns the current code point, updating line and column.
     * A "\r\n" pair is consumed as a whole and returned as a single '\n'.
     *
     * Returns: the consumed code point, or `dchar.init` (consuming nothing)
     *          at end of input.
     */
    dchar take()
    {
        if( this.eof )
            return dchar.init;

        auto current = this.peek();
        if( current == '\r' && this.peek( 1 ) == '\n' )
        {
            ++this.index;
            current = '\n';
        }

        ++this.index;

        // The position advances to the next line only once the line feed
        // itself has been consumed; columns restart at 0 like the first line.
        if( current == '\n' )
        {
            ++this.line;
            this.column = 0;
        }
        else
            ++this.column;

        return current;
    }

    /// Returns: true if the unread input starts with `search` (including when
    /// `search` ends exactly at the end of the input).
    bool isNext( dstring search )
    {
        if( this.eof || search.length > this.length - this.index )
            return false;

        return this.source[this.index .. this.index + search.length] == search;
    }

    /// Consumes `search` if the unread input starts with it.
    /// Returns: true if it was consumed.
    bool takeIfNext( dstring search )
    {
        if( !this.isNext( search ) )
            return false;

        foreach( _; 0 .. search.length )
            this.take();

        return true;
    }

    /// Consumes code points for as long as `pred` accepts the next one.
    /// Returns: the consumed text.
    dstring takeWhile( Predicate pred )
    {
        dstring result;
        while( !this.eof && pred( this.peek() ) )
            result ~= this.take();

        return result;
    }

    /// Like `takeWhile`, but discards the consumed text.
    void skipWhile( Predicate pred )
    {
        while( !this.eof && pred( this.peek() ) )
            this.take();
    }
}