1 module json.parser.lexer; 2 3 private { 4 import json.value : JsonException; 5 6 import std.algorithm; 7 import std.conv; 8 import std.functional; 9 import std.range; 10 import std.string; 11 import std.traits; 12 import std.uni; 13 import std.utf; 14 15 alias Tokenizer = bool delegate( dchar, out JsonToken ); 16 alias Predicate = bool delegate( dchar ); 17 } 18 19 package { 20 enum JsonToken.Type[dstring] Keywords = [ 21 "null": JsonToken.Type.Null, 22 "true": JsonToken.Type.True, 23 "false": JsonToken.Type.False, 24 ]; 25 26 enum JsonToken.Type[dchar] Punctuation = [ 27 '{': JsonToken.Type.LeftBrace, 28 '}': JsonToken.Type.RightBrace, 29 '[': JsonToken.Type.LeftSquare, 30 ']': JsonToken.Type.RightSquare, 31 ',': JsonToken.Type.Comma, 32 ':': JsonToken.Type.Colon, 33 ]; 34 } 35 36 final class JsonParserException : JsonException 37 { 38 private TextSpan _span; 39 TextSpan span() const pure nothrow @safe @property 40 { 41 return this._span; 42 } 43 44 package: 45 this( TextSpan span, string msg, string file = __FILE__, size_t line = __LINE__, Throwable next = null ) 46 { 47 this._span = span; 48 super( msg, file, line, next ); 49 } 50 51 this( JsonToken token, string msg, string file = __FILE__, size_t line = __LINE__, Throwable next = null ) 52 { 53 this._span = token.span; 54 super( msg, file, line, next ); 55 } 56 } 57 58 package struct TextSpan 59 { 60 immutable size_t line; 61 immutable size_t column; 62 immutable size_t index; 63 immutable size_t length; 64 65 this() @disable; 66 this( size_t line, size_t column, size_t index, size_t length = size_t.init ) 67 { 68 this.line = line; 69 this.column = column; 70 this.index = index; 71 this.length = length; 72 } 73 74 TextSpan withLength( size_t newLength ) 75 { 76 return TextSpan( this.line, this.column, this.index, newLength ); 77 } 78 } 79 80 final package class JsonToken 81 { 82 enum Type 83 { 84 Identifier, 85 86 String, 87 Number, 88 89 True, 90 False, 91 Null, 92 93 LeftSquare, 94 RightSquare, 95 LeftBrace, 96 RightBrace, 97 98 Comma, 99 Colon, 100 101 EndOfInput, 102 } 103 104 enum NumberType 105 { 106 signed, 107 unsigned, 108 floating, 109 } 110 111 private dstring _text; 112 private TextSpan _span; 113 114 immutable Type type; 115 immutable NumberType numberType = void; 116 117 dstring text() const @property 118 { 119 return this._text; 120 } 121 122 TextSpan span() const @property 123 { 124 return this._span; 125 } 126 127 this( Type type, dstring text, TextSpan span ) 128 { 129 this.type = type; 130 this._text = text; 131 this._span = span; 132 } 133 134 this( NumberType numType, dstring text, TextSpan span ) 135 { 136 this.numberType = numType; 137 this( Type.Number, text, span ); 138 } 139 140 string identify() 141 { 142 import std.utf; 143 import std.conv; 144 import std.string; 145 146 with( Type ) 147 switch( this.type ) 148 { 149 case Number: 150 case String: 151 case True: 152 case False: 153 case Null: 154 return to!( string )( this.type ).toLower(); 155 156 case EndOfInput: 157 return "end-of-input"; 158 159 default: 160 return this.text.toUTF8(); 161 } 162 } 163 } 164 165 alias StandardCompliant = Flag!"JsonStandardCompliantParsing"; 166 167 final package class Lexer 168 { 169 private: 170 dstring source; 171 immutable size_t length; 172 size_t index; 173 size_t line; 174 size_t column; 175 TextSpan[] spans; 176 Tokenizer[] tokenizers; 177 immutable StandardCompliant standard; 178 179 bool eof() const pure nothrow @safe @property 180 { 181 return this.index >= this.length; 182 } 183 184 public this( S )( S source, StandardCompliant standardCompliant ) if( isSomeString!S ) 185 { 186 this.source = source.toUTF32(); 187 this.length = this.source.length; 188 this.standard = standardCompliant; 189 this.tokenizers = [ 190 &this.tryLexString, 191 &this.tryLexNumber, 192 &this.tryLexWord, 193 &this.tryLexPunctuation, 194 ]; 195 } 196 197 public JsonToken[] tokenize() 198 { 199 JsonToken[] tokens; 200 while( !this.eof ) 201 { 202 this.skipWhile( c => c.isWhite ); 203 204 if( !this.standard && ( this.trySkipSingleLineComment() || this.trySkipMultiLineComment() ) ) 205 continue; 206 207 if( this.eof ) 208 break; 209 210 JsonToken token; 211 if( this.tokenizers.any!( fn => fn( this.peek(), token ) ) ) 212 tokens ~= token; 213 else 214 { 215 this.markStart(); 216 auto c = this.take(); 217 throw new JsonParserException( this.markEnd(), "Unexpected character '%s' (0x%04X)".format( c, c ) ); 218 } 219 } 220 221 this.markStart(); 222 tokens ~= this.makeToken( JsonToken.Type.EndOfInput, null ); 223 return tokens; 224 } 225 226 bool trySkipSingleLineComment() 227 { 228 if( !this.isNext( "//" ) ) 229 return false; 230 231 this.skipWhile( c => c != '\n' ); 232 return true; 233 } 234 235 bool trySkipMultiLineComment() 236 { 237 if( !this.isNext( "/*" ) ) 238 return false; 239 240 this.markStart(); 241 int level = 1; 242 243 this.takeIfNext( "/*" ); 244 while( !this.eof && level > 0 ) 245 { 246 if( this.isNext( "*/" ) ) 247 { 248 level -= 1; 249 this.takeIfNext( "*/" ); 250 this.markEnd(); 251 continue; 252 } 253 254 if( this.isNext( "/*" ) ) 255 { 256 level += 1; 257 this.markStart(); 258 this.takeIfNext( "/*" ); 259 continue; 260 } 261 262 if( this.eof ) 263 break; 264 265 this.take(); 266 } 267 268 // if this is true then we've reached EOF and there's an unterminated comment somewhere 269 if( level > 0 ) 270 throw new JsonParserException( 271 this.markEnd(), 272 "unexpected end-of-input (unclosed multi-line comment)" 273 ); 274 275 return true; 276 } 277 278 bool tryLexString( dchar c, out JsonToken token ) 279 { 280 if( c != '"' && c != '\'' ) 281 return false; 282 283 if( c == '\'' && this.standard ) 284 return false; 285 286 this.markStart(); 287 immutable terminator = this.take(); 288 289 290 if( c == '\'' && this.standard ) 291 throw new JsonParserException( 292 this.markEnd(), 293 "cannot use single-quoted strings in standard-compliant mode" 294 ); 295 296 dstring text; 297 dchar next; 298 while( !this.eof && ( next = this.peek() ) != terminator ) 299 { 300 if( this.standard && next.isControl && !next.isSpace ) 301 throw new JsonParserException( 302 this.markEnd(), 303 "Control characters (0x%04X) are not allowed in strings".format( next ) 304 ); 305 306 if( next == '\\' ) 307 { 308 text ~= this.handleEscapeSequence( this.take() ); 309 continue; 310 } 311 312 text ~= this.take(); 313 } 314 315 if( this.eof ) 316 throw new JsonParserException( this.markEnd(), "Unexpected end-of-input (unterminated string)" ); 317 318 assert( this.take() == terminator ); 319 320 token = this.makeToken( JsonToken.Type.String, text ); 321 return true; 322 } 323 324 bool tryLexNumber( dchar c, out JsonToken token ) 325 { 326 if( !c.isNumber && c != '-' ) 327 return false; 328 329 bool hasDecimal; 330 bool hasExponent; 331 bool forceTake = c == '-'; 332 bool signed = c == '-'; 333 334 bool pred( dchar ch ) 335 { 336 if( forceTake ) 337 { 338 forceTake = false; 339 return true; 340 } 341 342 if( ch == '.' ) 343 { 344 if( hasDecimal ) 345 throw new JsonParserException( this.markEnd(), "Duplicate decimal point in number" ); 346 347 hasDecimal = true; 348 return this.peek( 1 ).isNumber; 349 } 350 351 if( ch == 'e' || ch == 'E' ) 352 { 353 if( hasExponent ) 354 throw new JsonParserException( this.markEnd(), "Duplicate exponent in number" ); 355 356 hasExponent = true; 357 auto next = this.peek( 1 ); 358 if( next == '+' || next == '-' ) 359 { 360 forceTake = true; 361 return this.peek( 2 ).isNumber; 362 } 363 364 return next.isNumber; 365 } 366 367 return ch.isNumber; 368 } 369 370 this.markStart(); 371 auto text = this.takeWhile( &pred ); 372 auto type = hasDecimal || hasExponent 373 ? JsonToken.NumberType.floating 374 : ( signed ? JsonToken.NumberType.signed 375 : JsonToken.NumberType.unsigned ); 376 token = this.makeToken( type, text ); 377 return true; 378 } 379 380 bool tryLexWord( dchar c, out JsonToken token ) 381 { 382 if( !c.isAlpha && c != '_' ) 383 return false; 384 385 this.markStart(); 386 auto text = this.takeWhile( ch => ch.isAlpha || ch.isNumber || ch == '_' ); 387 if( auto type = text in Keywords ) 388 { 389 token = this.makeToken( *type, text ); 390 return true; 391 } 392 393 if( !this.standard ) 394 { 395 token = this.makeToken( JsonToken.Type.Identifier, text ); 396 return true; 397 } 398 399 throw new JsonParserException( 400 this.markEnd(), 401 "Unexpected '%s' (did you forget to quote an object key?)".format( text ) 402 ); 403 } 404 405 bool tryLexPunctuation( dchar c, out JsonToken token ) 406 { 407 foreach( ch, type; Punctuation ) 408 { 409 if( c == ch ) 410 { 411 this.markStart(); 412 token = this.makeToken( type, [ this.take() ] ); 413 return true; 414 } 415 } 416 417 return false; 418 } 419 420 dchar handleEscapeSequence( dchar escape ) 421 { 422 if( this.eof ) 423 throw new JsonParserException( 424 this.markEnd(), 425 "Unexpected end-of-input following escape sequence in string" 426 ); 427 428 switch( escape ) 429 { 430 case '"': 431 case '\\': 432 case '/': 433 return escape; 434 435 case 'b': return '\b'; 436 case 'f': return '\f'; 437 case 'n': return '\n'; 438 case 'r': return '\r'; 439 case 't': return '\t'; 440 441 case 'u': 442 { 443 int i = -1; 444 auto code = this.takeWhile( _ => ++i < 4 ); 445 446 if( code.length < 4 ) 447 throw new JsonParserException( this.markEnd(), "Unexpected end-of-input following escape sequence in string" ); 448 449 try 450 { 451 return code.to!( ushort )( 16 ).to!dchar; 452 } 453 catch( Throwable th ) 454 { 455 throw new JsonParserException( this.markEnd(), th.msg, __FILE__, __LINE__, th ); 456 } 457 } 458 459 default: 460 throw new JsonParserException( this.markEnd(), "Unrecognized escape sequence '\\%s'".format( escape ) ); 461 } 462 } 463 464 void markStart() 465 { 466 this.spans ~= TextSpan( this.line, this.column, this.index ); 467 } 468 469 TextSpan markEnd() 470 { 471 auto span = this.spans.back; 472 this.spans.popBack(); 473 474 return span.withLength( this.index - span.index ); 475 } 476 477 JsonToken makeToken( JsonToken.Type type, dstring text ) 478 { 479 return new JsonToken( type, text, this.markEnd() ); 480 } 481 482 JsonToken makeToken( JsonToken.NumberType numType, dstring text ) 483 { 484 return new JsonToken( numType, text, this.markEnd() ); 485 } 486 487 dchar peek( int distance = 0 ) 488 { 489 auto newIndex = this.index + distance; 490 if( newIndex < 0 || newIndex >= this.length ) 491 return dchar.init; 492 493 return this.source[newIndex]; 494 } 495 496 dchar take() 497 { 498 auto current = this.peek(); 499 immutable next = this.peek( 1 ); 500 501 if( current == '\r' ) 502 { 503 if( next == '\n' ) 504 { 505 ++this.index; 506 current = next; 507 } 508 } 509 510 if( next == '\n' ) 511 { 512 ++this.line; 513 this.column = 0; 514 } 515 516 ++this.index; 517 ++this.column; 518 519 return current; 520 } 521 522 bool isNext( dstring search ) 523 { 524 auto len = search.length; 525 if( this.index + len >= this.length ) 526 return false; 527 528 return this.source[this.index .. this.index + len] == search; 529 } 530 531 bool takeIfNext( dstring search ) 532 { 533 if( this.isNext( search ) ) 534 { 535 foreach( _; 0 .. search.length ) 536 this.take(); 537 } 538 539 return false; 540 } 541 542 dstring takeWhile( Predicate pred ) 543 { 544 dstring result; 545 while( !this.eof && pred( this.peek() ) ) 546 result ~= this.take(); 547 548 return result; 549 } 550 551 void skipWhile( Predicate pred ) 552 { 553 while( !this.eof && pred( this.peek() ) ) 554 this.take(); 555 } 556 }