Current Version: 1.0.10
Project Name: csspp
lexer.cpp
Go to the documentation of this file.
1 // CSS Preprocessor
2 // Copyright (C) 2015-2016 Made to Order Software Corp.
3 //
4 // This program is free software; you can redistribute it and/or modify
5 // it under the terms of the GNU General Public License as published by
6 // the Free Software Foundation; either version 2 of the License, or
7 // (at your option) any later version.
8 //
9 // This program is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 // GNU General Public License for more details.
13 //
14 // You should have received a copy of the GNU General Public License
15 // along with this program; if not, write to the Free Software
16 // Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 
41 #include "csspp/lexer.h"
42 
43 #include "csspp/exceptions.h"
44 #include "csspp/unicode_range.h"
45 
46 #include <cmath>
47 #include <cstdio>
48 #include <iostream>
49 
50 namespace csspp
51 {
52 
53 lexer::lexer(std::istream & in, position const & pos)
54  : f_in(in)
55  , f_position(pos)
56  , f_start_position(pos)
57 {
58 }
59 
// Body of lexer::next_token(): the cross-reference index at the end of this
// listing gives "node::pointer_t next_token()" as defined at lexer.cpp:60.
// NOTE(review): the signature line and a number of statements are absent
// from this listing; every gap in the embedded line numbers (64, 73, 85,
// ...) marks a line that is not shown here.
//
// The loop reads one character with getc() and dispatches on it. The
// visible paths either return a node built by a helper (number(),
// identifier(), variable(), hash(), comment(), unicode_range()), use
// `continue` to read the next character, or `break` out of the switch,
// in which case an "invalid input character" error is emitted and the
// loop goes on with the next character.
61 {
62  for(;;)
63  {
65 
66  wide_char_t const c(getc());
67 
68 //std::cerr << "--- got char " << std::hex << " 0x" << c << "\n";
69 
70  switch(c)
71  {
72  case EOF: // CSS uses 0xFFFD to represent EOF, we do not
74 
75  case '=':
76  {
    // "==" is accepted as well; any other following character is restored
77  wide_char_t const n(getc());
78  if(n != '=')
79  {
80  ungetc(n);
81  }
82  else
83  {
84  // really warn about it?
86  << "we accepted '==' instead of '=' in an expression, you probably want to change the operator to just '=', though."
88  }
89 
91  }
92 
93  case ',':
95 
96  case ':':
97  {
    // check for ":="
98  wide_char_t const n(getc());
99  if(n == '=')
100  {
102  }
103  ungetc(n);
105  }
106 
107  case ';':
109 
110  case '!':
111  {
    // check for "!="
112  wide_char_t const n(getc());
113  if(n == '=')
114  {
116  }
117  ungetc(n);
119  }
120 
121  case '?':
123 
124  case '>':
125  {
    // check for ">="
126  wide_char_t const n(getc());
127  if(n == '=')
128  {
130  }
131  ungetc(n);
133  }
134 
135  case '(':
137 
138  case ')':
140 
141  case '[':
143 
144  case ']':
146 
147  case '{':
149 
150  case '}':
152 
153  case '.':
154  {
    // peek at the next character: a digit means the period starts a
    // decimal number and number() is given the '.' itself
155  wide_char_t const n(getc());
156  ungetc(n);
157  if(n >= '0' && n <= '9')
158  {
159  // found a decimal number
160  return number(c);
161  }
163  }
164  //NOTREACHED
165 
166  case '&':
167  {
    // check for "&&"
168  wide_char_t const n(getc());
169  if(n == '&')
170  {
172  }
173  ungetc(n);
175  }
176 
177  case '<':
178  {
    // check for the "<!--" sequence, then for "<="; characters read
    // ahead are pushed back in reverse order (l, p, then n)
179  wide_char_t const n(getc());
180  if(n == '!')
181  {
182  wide_char_t const p(getc());
183  if(p == '-')
184  {
185  wide_char_t const l(getc());
186  if(l == '-')
187  {
189  }
190  ungetc(l);
191  }
192  ungetc(p);
193  }
194  else if(n == '=')
195  {
197  }
198  ungetc(n);
200  }
201  break;
202 
203  case '+':
204  {
    // "+<digit>" and "+.<digit>" are numbers; number() receives the
    // sign and re-reads the characters pushed back here
205  wide_char_t const n(getc());
206  if(n >= '0' && n <= '9')
207  {
208  // found a positive number
209  ungetc(n);
210  return number(c);
211  }
212  if(n == '.')
213  {
214  wide_char_t const p(getc());
215  if(p >= '0' && p <= '9')
216  {
217  // found a positive decimal number
218  ungetc(p);
219  ungetc(n);
220  return number(c);
221  }
222  ungetc(p);
223  }
224  ungetc(n);
226  }
227  //NOTREACHED
228 
229  case '-':
230  {
    // "-<digit>" and "-.<digit>" are numbers, "--" is checked for a
    // following '>' (i.e. "-->"), otherwise the dash may start an
    // identifier
231  wide_char_t const n(getc());
232  if(n >= '0' && n <= '9')
233  {
234  // found a negative number
235  ungetc(n);
236  return number(c);
237  }
238  if(n == '.')
239  {
240  wide_char_t const p(getc());
241  if(p >= '0' && p <= '9')
242  {
243  // found a negative decimal number
244  ungetc(p);
245  ungetc(n);
246  return number(c);
247  }
248  ungetc(p);
249  }
250  if(n == '-')
251  {
252  wide_char_t const p(getc());
253  if(p == '>')
254  {
256  }
257  ungetc(p);
258  ungetc(n);
259  // an identifier cannot start with two dashes in a row
261  }
262  ungetc(n);
    // the character after the dash must be an identifier character (or
    // a backslash) which is not a digit
263  if((is_identifier(n) || n == '\\')
264  && (n < '0' || n > '9'))
265  {
266  return identifier(c);
267  }
269  }
270  //NOTREACHED
271 
272  case '^':
273  {
    // check for "^="
274  wide_char_t const n(getc());
275  if(n == '=')
276  {
278  }
279  ungetc(n);
280  // character necessary by itself?
281  }
282  break;
283 
284  case '$':
285  {
    // check for "$=", then for a variable name; variable() receives the
    // first character of the name (not the '$')
286  wide_char_t const n(getc());
287  if(n == '=')
288  {
290  }
291  if(is_variable(n))
292  {
293  return variable(n);
294  }
295  ungetc(n);
297  }
298  //NOTREACHED
299 
300  case '~':
301  {
    // check for "~="
302  wide_char_t const n(getc());
303  if(n == '=')
304  {
306  }
307  ungetc(n);
309  }
310  break;
311 
312  case '*':
313  {
    // check for "*=" and "**"
314  wide_char_t const n(getc());
315  if(n == '=')
316  {
318  }
319  if(n == '*')
320  {
322  }
323  ungetc(n);
325  }
326  //NOTREACHED
327 
328  case '|':
329  {
    // check for "||" and "|="
330  wide_char_t const n(getc());
331  if(n == '|')
332  {
334  }
335  if(n == '=')
336  {
338  }
339  ungetc(n);
340  // the pipe is used as a scoping operator "<name>|<name>"
342  }
343  break;
344 
345  case '"':
346  case '\'':
347  {
    // string() reads up to the matching quote; the statement creating
    // the node `n` is not shown in this listing
348  std::string const str(string(c));
350  n->set_string(str);
351  return n;
352  }
353  //NOTREACHED
354 
355  case '/':
356  {
    // "/*" and "//" introduce comments; comment() returns a null
    // pointer when the comment is not to be kept, in which case the
    // lexer goes on with the next token
357  wide_char_t const n(getc());
358  if(n == '*')
359  {
360  node::pointer_t cn(comment(true));
361  if(cn)
362  {
363  return cn;
364  }
365  // silently let it go
366  continue;
367  }
368  else if(n == '/')
369  {
370  node::pointer_t cn(comment(false));
371  if(cn)
372  {
375  << "C++ comments should not be preserved as they are not supported by most CSS parsers."
377  return cn;
378  }
379  // silently let it go
380  continue;
381  }
382  ungetc(n);
384  }
385 
386  case ' ':
387  case '\t':
388  case '\n':
389  //case '\r': -- not needed since \r is transformed into \n by getc()
390  case '\f':
391  {
392  // white spaces are significant in some places and
393  // definitively not acceptable in others so we have to
394  // create a token for them... this is important for the
395  // parser, not so much for the output
396  for(;;)
397  {
    // skip the whole run of spaces; the first non-space character
    // is pushed back
398  wide_char_t const n(getc());
399  if(!is_space(n))
400  {
401  ungetc(n);
403  }
404  }
405  }
406  //NOTREACHED
407 
408  case '0':
409  case '1':
410  case '2':
411  case '3':
412  case '4':
413  case '5':
414  case '6':
415  case '7':
416  case '8':
417  case '9':
418  return number(c);
419 
420  case '#':
421  {
    // hash() returns a null pointer (after emitting an error) when the
    // '#' is not followed by a valid name
422  node::pointer_t n(hash());
423  if(n)
424  {
425  return n;
426  }
427  continue;
428  }
429 
430  case '%':
431  {
    // a '%' followed by the start of an identifier falls through to
    // the identifier handling below
432  wide_char_t const n(getc());
433  if(!is_start_identifier(n))
434  {
436  }
437  ungetc(n);
438  }
439  /*FALLTHROUGH*/
440  case '\\':
441  case '@':
442  {
444  if(!n->is(node_type_t::EOF_TOKEN))
445  {
446  return n;
447  }
448  // EOF_TOKEN is not returned, we may not be at the end of
449  // the input stream, but that identifier was empty; the
450  // identifier() function already generated an error
451  continue;
452  }
453  break;
454 
455  case 'u':
456  case 'U':
457  {
    // "U+" followed by a hexadecimal digit or '?' is a unicode range;
    // anything else is a regular identifier starting with 'u' or 'U'
458  wide_char_t const n(getc());
459  if(n == '+')
460  {
461  wide_char_t const d(getc());
462  if(is_hex(d) || d == '?')
463  {
464  // U+<number>
465  return unicode_range(d);
466  }
467  ungetc(d);
468  }
469  ungetc(n);
470  return identifier(c);
471  }
472  //NOTREACHED
473 
474  default:
475  if(is_start_identifier(c))
476  {
477  return identifier(c);
478  }
479  break;
480 
481  }
482 
    // reached only through a `break` in the switch above
483  error::instance() << f_start_position << "invalid input character: U+" << error_mode_t::ERROR_HEX << c << "." << error_mode_t::ERROR_ERROR;
484  }
485 }
486 
487 wide_char_t lexer::mbtowc(char const * s)
488 {
489  unsigned char c(static_cast<unsigned char>(*s));
490  if(c < 0x80)
491  {
492  // ASCII is the same in UTF-8
493  return c;
494  }
495  wide_char_t wc(0);
496  size_t cnt(0);
497  if(c >= 0xF0)
498  {
499  if(c >= 0xF8)
500  {
501  error::instance() << f_start_position << "byte U+" << error_mode_t::ERROR_HEX << c << " not valid in a UTF-8 stream." << error_mode_t::ERROR_ERROR;
502  return 0xFFFD;
503  }
504  wc = c & 0x07;
505  cnt = 3;
506  }
507  else if(c >= 0xE0)
508  {
509  wc = c & 0x0F;
510  cnt = 2;
511  }
512  else if(c >= 0xC0)
513  {
514  wc = c & 0x1F;
515  cnt = 1;
516  }
517  else
518  {
519  error::instance() << f_start_position << "byte U+" << error_mode_t::ERROR_HEX << c << " not valid to introduce a UTF-8 encoded character." << error_mode_t::ERROR_ERROR;
520  return 0xFFFD;
521  }
522 
523  for(++s; cnt > 0; --cnt, ++s)
524  {
525  // skip one character
526  c = static_cast<unsigned char>(*s);
527  if(c == '\0')
528  {
529  error::instance() << f_start_position << "sequence of bytes too short to represent a valid UTF-8 encoded character." << error_mode_t::ERROR_ERROR;
530  return 0xFFFD;
531  }
532  if(c < 0x80 || c > 0xBF)
533  {
534  error::instance() << f_start_position << "invalid sequence of bytes, it cannot represent a valid UTF-8 encoded character." << error_mode_t::ERROR_ERROR;
535  return 0xFFFD;
536  }
537  wc = (wc << 6) | (c & 0x3F);
538  }
539  if(*s != '\0')
540  {
541  error::instance() << f_start_position << "sequence of bytes too long, it cannot represent a valid UTF-8 encoded character." << error_mode_t::ERROR_ERROR;
542  return 0xFFFD;
543  }
544 
545  return wc;
546 }
547 
548 void lexer::wctomb(wide_char_t const wc, char * mb, size_t max_length)
549 {
550  // require a buffer large enough for the longest acceptable UTF-8 code
551  if(max_length < 5)
552  {
553  // this is an internal (misuse) error
554  throw csspp_exception_overflow("buffer too small to convert a wc to UTF-8.");
555  }
556 
557  // in case of error, make sure the string is empty
558  mb[0] = '\0';
559 
560  if(static_cast<wide_uchar_t>(wc) < 0x80)
561  {
562  // this would also encode '\0'... although it gets converted to 0xFFFD
563  mb[0] = static_cast<char>(wc);
564  mb[1] = '\0';
565  return;
566  }
567  if(static_cast<wide_uchar_t>(wc) < 0x800)
568  {
569  mb[0] = static_cast<char>((wc >> 6) | 0xC0);
570  mb[1] = (wc & 0x3F) | 0x80;
571  mb[2] = '\0';
572  return;
573  }
574  if(static_cast<wide_uchar_t>(wc) < 0x10000)
575  {
576  if(wc >= 0xD800 && wc <= 0xDFFF)
577  {
578  error::instance() << f_start_position << "surrogate characters cannot be encoded in UTF-8." << error_mode_t::ERROR_ERROR;
579  return;
580  }
581  if(wc == 0xFFFE || wc == 0xFFFF)
582  {
583  error::instance() << f_start_position << "characters 0xFFFE and 0xFFFF are not valid." << error_mode_t::ERROR_ERROR;
584  return;
585  }
586 
587  mb[0] = static_cast<char>((wc >> 12) | 0xE0);
588  mb[1] = ((wc >> 6) & 0x3F) | 0x80;
589  mb[2] = (wc & 0x3F) | 0x80;
590  mb[3] = '\0';
591  return;
592  }
593  if(static_cast<wide_uchar_t>(wc) < 0x110000)
594  {
595  if((wc & 0xFFFF) == 0xFFFE || (wc & 0xFFFF) == 0xFFFF)
596  {
597  error::instance() << f_start_position << "any characters that end with 0xFFFE or 0xFFFF are not valid." << error_mode_t::ERROR_ERROR;
598  return;
599  }
600  mb[0] = static_cast<char>((wc >> 18) | 0xF0);
601  mb[1] = ((wc >> 12) & 0x3F) | 0x80;
602  mb[2] = ((wc >> 6) & 0x3F) | 0x80;
603  mb[3] = (wc & 0x3F) | 0x80;
604  mb[4] = '\0';
605  return;
606  }
607 
608  error::instance() << f_start_position << "character too large, it cannot be encoded in UTF-8." << error_mode_t::ERROR_ERROR;
609 }
610 
611 std::string lexer::wctomb(wide_char_t const wc)
612 {
613  char mb[6];
614  wctomb(wc, mb, sizeof(mb) / sizeof(mb[0]));
615  return mb;
616 }
617 
// Body of lexer::getc(): the cross-reference index at the end of this
// listing gives "wide_char_t getc()" as defined at lexer.cpp:618.
// NOTE(review): the signature line and lines 686, 696 and 702 are absent
// from this listing.
//
// Returns the next character of the input:
//  - a character previously pushed back with ungetc() if there is one
//    (last pushed, first returned);
//  - otherwise a character read from f_in; bytes of 0x80 and more are
//    handled as UTF-8 and decoded with mbtowc(); invalid bytes generate an
//    error and the function returns 0xFFFD;
//  - '\r', "\r\n", '\n' and '\f' read from the stream are all returned
//    as '\n';
//  - a null character is returned as 0xFFFD.
619 {
620  wide_char_t c(0);
621 
622  // do we have characters in our unget buffer?
623  if(f_ungetc_pos > 0)
624  {
625  // yes, retrieve the character from the last ungetc()
626  --f_ungetc_pos;
627  c = f_ungetc[f_ungetc_pos];
628  }
629  else
630  {
631  // no, read the next character from the input stream
632  c = f_in.get();
633  if(c >= 0x80)
634  {
635  // here we cleanly accept very long sequences
636  if(c >= 0xC0 && c < 0xFF)
637  {
638  // starts as expected, now read the following byte sequence
639  // for that UTF-8 character
640  char mb[8];
641  mb[0] = c;
642  for(size_t i(1);; ++i)
643  {
644  if(i >= sizeof(mb) / sizeof(mb[0]))
645  {
646  // remove the whole invalid sequence (this could be
647  // a character that is too long)
648  for(c = f_in.get(); c >= 0x80 && c <= 0xBF; c = f_in.get());
    // the byte that ended the sequence is not part of it
649  if(c != EOF)
650  {
651  f_in.unget();
652  }
653  error::instance() << f_start_position << "too many follow bytes, it cannot represent a valid UTF-8 character." << error_mode_t::ERROR_ERROR;
654  return 0xFFFD;
655  }
656  c = f_in.get();
657  if(c < 0x80 || c > 0xBF) // the test c < 0x80 includes EOF
658  {
659  if(c != EOF)
660  {
661  // make sure we do not lose the next byte
662  f_in.unget();
663  }
    // mbtowc() expects a null terminated sequence
664  mb[i] = '\0';
665  break;
666  }
667  mb[i] = c;
668  }
669  c = mbtowc(mb);
670  }
671  else
672  {
    // a continuation byte (0x80 to 0xBF) or 0xFF cannot start a
    // character; skip the continuation bytes that follow
673  error::instance() << f_start_position << "unexpected byte in input buffer: U+" << error_mode_t::ERROR_HEX << c << "." << error_mode_t::ERROR_ERROR;
674  for(c = f_in.get(); c >= 0x80 && c <= 0xBF; c = f_in.get());
675  if(c != EOF)
676  {
677  f_in.unget();
678  }
679  return 0xFFFD;
680  }
681  }
682 
683  // special case for the "\r\n" sequence (and '\r' by itself)
684  if(c == '\r')
685  {
687  c = f_in.get();
688  if(c != '\n')
689  {
690  f_in.unget();
691  }
692  return '\n'; // simplify the rest of the lexer
693  }
694  else if(c == '\n')
695  {
697  return '\n';
698  }
699  else if(c == '\f')
700  {
701  // most editors probably don't count pages and lines...
703  return '\n'; // simplify the rest of the lexer
704  }
705  }
706 
707  // invalid character read? if so convert to 0xFFFD
708  if(c == '\0')
709  {
710  return 0xFFFD;
711  }
712 
713  return c;
714 }
715 
// Body of lexer::ungetc(): the cross-reference index at the end of this
// listing gives "void ungetc(wide_char_t c)" as defined at lexer.cpp:716
// (the signature line is absent from this listing.)
//
// Pushes `c` on the f_ungetc stack so the next getc() returns it. EOF and
// 0xFFFD are silently ignored. A character outside of 0 to 0x10FFFF or a
// full buffer raises csspp_exception_logic.
717 {
718  // ignore EOF
719  if(c == EOF || c == 0xFFFD)
720  {
721  return;
722  }
723 
724  // make sure only valid characters are ungotten
725  if(c < 0 || c > 0x10FFFF)
726  {
727  // this error should never happen
728  throw csspp_exception_logic("lexer called ungetc() with a character out of range."); // LCOV_EXCL_LINE
729  }
730 
731  // make sure we do not overflow the buffer
732  if(f_ungetc_pos >= sizeof(f_ungetc) / sizeof(f_ungetc[0]))
733  {
734  // this error should never happen
735  throw csspp_exception_logic("lexer called ungetc() too many times and ran out of space"); // LCOV_EXCL_LINE
736  }
737 
738  // push c in the unget buffer
739  f_ungetc[f_ungetc_pos] = c;
740 
741  ++f_ungetc_pos;
742 }
743 
// Body of lexer::hex_to_dec(): the cross-reference index at the end of this
// listing gives "static int hex_to_dec(wide_char_t c)" as defined at
// lexer.cpp:744 (the signature line is absent from this listing.)
//
// Converts one hexadecimal digit ('0'-'9', 'A'-'F' or 'a'-'f') to its value
// (0 to 15). Any other character raises csspp_exception_logic; the callers
// visible in this file test the character with is_hex() first.
745 {
746  if(c >= '0' && c <= '9')
747  {
748  return c - '0';
749  }
750  if(c >= 'A' && c <= 'F')
751  {
752  return c - 'A' + 10;
753  }
754  if(c >= 'a' && c <= 'f')
755  {
756  return c - 'a' + 10;
757  }
758 
759  // this error should never happen
760  throw csspp_exception_logic("hex_to_dec() called with an invalid digit."); // LCOV_EXCL_LINE
761 }
762 
// Body of lexer::escape(): the cross-reference index at the end of this
// listing gives "wide_char_t escape()" as defined at lexer.cpp:763 (the
// signature line is absent from this listing.)
//
// Reads what follows a backslash (the callers already consumed the '\').
// A newline, 0xFFFD or EOF generates an error and 0xFFFD is returned. One
// to six hexadecimal digits are converted to the corresponding code point
// (0 and values of 0x110000 or more are errors returning 0xFFFD). Any other
// character is returned as is.
764 {
765  wide_char_t c(getc());
766  if(c == '\n')
767  {
768  // this is not allowed here
769  error::instance() << f_start_position << "spurious newline character after a \\ character outside of a string." << error_mode_t::ERROR_ERROR;
770  return 0xFFFD;
771  }
772  if(c == 0xFFFD)
773  {
774  // this is not allowed here
775  error::instance() << f_start_position << "invalid character after a \\ character." << error_mode_t::ERROR_ERROR;
776  return 0xFFFD;
777  }
778  if(c == EOF)
779  {
780  // this is considered valid in standard CSS
781  error::instance() << f_start_position << "found EOF right after \\." << error_mode_t::ERROR_ERROR;
782  return 0xFFFD;
783  }
784 
785  // convert from hexadecimal?
786  if(is_hex(c))
787  {
788  wide_char_t wc(hex_to_dec(c));
    // the first digit was read above, accept up to 5 more
    // NOTE(review): when all six digits are present the loop ends
    // without reading the next character, so a space following a six
    // digit escape is not eaten here -- confirm against the CSS
    // definition of an escape
789  for(int count(1); count < 6; ++count)
790  {
791  c = getc();
792  if(!is_hex(c))
793  {
794  // the following space must be eaten!
795  if(!is_space(c))
796  {
797  // but other characters we keep
798  ungetc(c);
799  }
800  break;
801  }
802  wc = wc * 16 + hex_to_dec(c);
803  if(wc >= 0x110000)
804  {
805  error::instance() << f_start_position << "escape character too large for Unicode." << error_mode_t::ERROR_ERROR;
806  return 0xFFFD;
807  }
808  }
809  if(wc == 0)
810  {
811  error::instance() << f_start_position << "escape character '\\0' is not acceptable in CSS." << error_mode_t::ERROR_ERROR;
812  return 0xFFFD;
813  }
814  return wc;
815  }
816  else
817  {
818  // c is the character being escaped
819  return c;
820  }
821 }
822 
// Body of lexer::identifier(): the cross-reference index at the end of this
// listing gives "node::pointer_t identifier(wide_char_t c)" as defined at
// lexer.cpp:823.
// NOTE(review): the signature line and lines 827, 831, 836, 868, 905, 978
// and 985 are absent from this listing; among them are the declaration of
// `type` and the statements creating the nodes named `n` in the url() and
// function branches.
//
// Reads an identifier starting with `c` (which may be '%', '@', '-', a
// backslash or a start identifier character.) The name is accumulated
// twice: as written (`id`) and in lowercase (`lowercase_id`). When the
// identifier is immediately followed by '(' and `type` is not AT_KEYWORD,
// "url" gets its content read here and other names are handled as
// functions; otherwise the character ending the identifier is pushed back
// and a node of type `type` is returned.
//
// NOTE(review): std::tolower() is called with a full wide_char_t; the
// <cctype> version is only defined for unsigned char values and EOF --
// confirm which overload is selected and that large code points are safe.
824 {
825  std::string id;
826  std::string lowercase_id;
828 
    // skip the introducer, the first character of the name follows
829  if(c == '%')
830  {
832  c = getc();
833  }
834  else if(c == '@')
835  {
837  c = getc();
838  }
839 
    // one leading dash is accepted
840  if(c == '-')
841  {
842  id += "-";
843  lowercase_id += "-";
844  c = getc();
845  }
846 
    // first character of the name: an escape or a start identifier
    // character
847  if(c == '\\')
848  {
849  c = escape();
850  if(c != 0xFFFD)
851  {
852  id += wctomb(c);
853  lowercase_id += wctomb(std::tolower(c));
854  }
855  }
856  else if(is_start_identifier(c))
857  {
858  id += wctomb(c);
859  lowercase_id += wctomb(std::tolower(c));
860  }
861  else
862  {
863  if(type == node_type_t::AT_KEYWORD)
864  {
865  // (TBD: should '@' be returned by itself?)
866  ungetc(c);
867  error::instance() << f_start_position << "found an empty identifier." << error_mode_t::ERROR_ERROR;
869  }
870  // this should not happen because we do not call the identifier()
871  // function with such invalid non-sense
872  throw csspp_exception_logic("lexer::identifier() called with an invalid identifier start."); // LCOV_EXCL_LINE
873  }
874 
    // remaining characters of the name
875  for(;;)
876  {
877  c = getc();
878  if(c == '\\')
879  {
880  c = escape();
881  if(c == 0xFFFD)
882  {
883  // this happens when a backslash is the very last character
884  // of an input file
885  break;
886  }
887  }
888  else if(!is_identifier(c))
889  {
890  break;
891  }
892  id += wctomb(c);
893  lowercase_id += wctomb(std::tolower(c));
894  }
895 
896  // this can happen if the '\' was followed by EOF
897  // note that the '@' followed by something else than a valid
898  // identifier start character is caught sooner (just before
899  // the throw a couple of blocks up)
900  if(id.empty())
901  {
902  // well... that was an "empty" token, so ignore and return EOF instead
903  ungetc(c);
904  error::instance() << f_start_position << "found an empty identifier." << error_mode_t::ERROR_ERROR;
906  }
907 
908  if(c == '(' && type != node_type_t::AT_KEYWORD)
909  {
910  if(lowercase_id == "url")
911  {
912  // very special case of a URL
913  // (this is nearly like a function except that the parameter
914  // does not need to be a string even though it should be)
915  do
916  {
917  // skip all whitespaces
918  c = getc();
919  }
920  while(is_space(c));
921  std::string url;
922  if(c == '"' || c == '\'')
923  {
924  // 'c' represents the quote character
925  url = string(c);
926  }
927  else
928  {
929  // no quotes, read data up to the next ')'
930  // generate an error on any unexpected character
931  url += wctomb(c);
932  for(;;)
933  {
934  c = getc();
935  if(c == ')'
936  || is_space(c))
937  {
938  break;
939  }
940  if(c == EOF
941  || c == '"'
942  || c == '\''
943  || c == '('
944  || is_non_printable(c))
945  {
946  error::instance() << f_start_position << "found an invalid URL, one with forbidden characters." << error_mode_t::ERROR_ERROR;
947  c = ')'; // simulate us ending cleanly to avoid a double error
948  break;
949  }
950 
951  url += wctomb(c);
952  }
953  }
954 
955  // got the ')' yet?
956  if(c != ')')
957  {
958  for(;;)
959  {
960  c = getc();
961  if(c == ')')
962  {
963  break;
964  }
965  if(!is_space(c))
966  {
967  error::instance() << f_start_position << "found an invalid URL, one which includes spaces or has a missing ')'." << error_mode_t::ERROR_ERROR;
968  // TODO: determine whether we should break
969  // or skip until we find a parenthesis
970  // we may also want to check the character
971  // (i.e. skip up to ')' or ';', '\n' etc.)
972  break;
973  }
974  // skip trailing spaces
975  }
976  }
977 
979  n->set_string(url);
980  return n;
981  }
982  else
983  {
984  // special case of a function
986  // functions are always considered case insensitive
987  // (although some Microsoft old extensions were case sensitive...)
988  n->set_string(lowercase_id);
989  return n;
990  }
991  }
992 
    // the character that ended the identifier is not part of it
993  ungetc(c);
994 
995  // we got an identifier
996  node::pointer_t n(new node(type, f_start_position));
997  n->set_string(id);
998  n->set_lowercase_string(lowercase_id);
999  return n;
1000 }
1001 
// Body of lexer::number(): the cross-reference index at the end of this
// listing gives "node::pointer_t number(wide_char_t c)" as defined at
// lexer.cpp:1002.
// NOTE(review): the signature line and lines 1181 and 1204 (presumably the
// statements allocating the node `n`) are absent from this listing.
//
// Reads a number starting with `c` (a sign, a digit or a period): an
// optional sign, the integral part, an optional fraction, an optional
// exponent and finally either a dimension (an identifier such as "em") or
// a percent sign. The result is a decimal number when a fraction, a
// non-zero exponent or a '%' was found, an integer otherwise. The boolean
// of the node records whether a sign was present.
1003 {
1004  bool const has_sign(c == '-' || c == '+');
1005  int const sign(c == '-' ? -1 : 1);
1006  if(has_sign)
1007  {
1008  // skip the sign if we have one
1009  c = getc();
1010  }
1011 
1012  // the first part is an integer number
1013  integer_t integer(0);
1014  if(is_digit(c))
1015  {
1016  // number before the period ("integer")
1017  integer = c - '0';
1018  for(;;)
1019  {
1020  c = getc();
1021  if(!is_digit(c))
1022  {
1023  break;
1024  }
    // computed as unsigned so the overflow test below is well defined
1025  uint64_t ni(static_cast<uint64_t>(integer) * 10 + c - '0');
1026  if(ni >= 0x8000000000000000LL)
1027  {
1028  // we accept all up to the time it goes negative
1029  error::instance() << f_start_position << "integral part too large for a number." << error_mode_t::ERROR_ERROR;
1030  }
1031  integer = static_cast<integer_t>(ni);
1032  }
1033  }
1034 
1035  // we can have a decimal part
    // decimal_frac remains 1.0 as long as no fraction was found, which is
    // how the code below distinguishes integers from decimal numbers
1036  decimal_number_t decimal_part(0);
1037  decimal_number_t decimal_frac(1.0);
1038  if(c == '.')
1039  {
1040  for(;;)
1041  {
1042  c = getc();
1043  if(!is_digit(c))
1044  {
1045  break;
1046  }
1047  decimal_frac *= 10.0;
1048  decimal_part += (c - '0') / decimal_frac;
    // the range test makes the error appear for one digit only
1049  if(decimal_frac >= 1e21 && decimal_frac < 1e22)
1050  {
1051  error::instance() << f_start_position << "fraction too large for a decimal number." << error_mode_t::ERROR_ERROR;
1052  }
1053  }
1054 #pragma GCC diagnostic push
1055 #pragma GCC diagnostic ignored "-Wfloat-equal"
1056  if(decimal_frac == 1.0)
1057 #pragma GCC diagnostic pop
1058  {
1059  // TBD: I do not think that a number can be followed by a class
1060  // so I do not think this error is a problem
1061  // 35.my-class
1062  error::instance() << f_start_position << "decimal number must have at least one digit after the decimal point." << error_mode_t::ERROR_ERROR;
1063  // this won't affect the resulting value, however it will
1064  // mark the number as a decimal number instead of an integer
1065  decimal_frac = 10.0;
1066  }
1067  }
1068 
1069  integer_t exponent(0);
1070  if(c == 'e' || c == 'E')
1071  {
1072  // we have to make sure this looks like an exponent otherwise
1073  // we are likely to break a dimension such as "em"
    // look ahead at up to two characters and push them back
1074  bool is_exponent(false);
1075  wide_char_t const s(getc());
1076  if(s == '-' || s == '+')
1077  {
1078  wide_char_t const d(getc());
1079  if(is_digit(d))
1080  {
1081  is_exponent = true;
1082  }
1083  ungetc(d);
1084  }
1085  else if(is_digit(s))
1086  {
1087  is_exponent = true;
1088  }
1089  ungetc(s);
1090  if(is_exponent)
1091  {
1092  c = getc();
1093  integer_t exponent_sign(1);
1094  if(c == '-')
1095  {
1096  exponent_sign = -1;
1097  c = getc();
1098  }
1099  else if(c == '+')
1100  {
1101  c = getc();
1102  }
1103  if(!is_digit(c))
1104  {
1105  // see definition of is_exponent to understand why this is throw
1106  throw csspp_exception_logic("we just checked that there would be a digit here, optionally preceeded by a sign."); // LCOV_EXCL_LINE
1107  }
1108  for(; is_digit(c); c = getc())
1109  {
1110  exponent = exponent * 10 + c - '0';
1111  if(exponent >= 1024)
1112  {
1113  error::instance() << f_start_position << "exponent too large for a decimal number." << error_mode_t::ERROR_ERROR;
1114  }
1115  }
1116  exponent *= exponent_sign;
1117  }
1118  }
1119 
1120  // dimension is empty by default (i.e. we are just dealing with a number)
1121  // if not empty, then the DECIMAL_NUMBER and INTEGER are dimensions
1122  std::string dimension;
1123  if(is_identifier(c)
1124  || c == '\\')
1125  {
1126  // unfortunately, calling the identifier() function would
1127  // (1) force the dimension to start with a start identifier
1128  // character; (2) create an unnecessary node; so instead we
1129  // duplicate the inner loop here
1130  for(;;)
1131  {
1132  if(c == '\\')
1133  {
1134  c = escape();
1135  if(c == 0xFFFD)
1136  {
1137  // this happens when a backslash is the very last character
1138  // of an input file
1139  break;
1140  }
1141  }
1142  else if(!is_identifier(c))
1143  {
1144  ungetc(c);
1145  c = '\0'; // make sure it is not %
1146  break;
1147  }
1148  dimension += wctomb(std::tolower(c));
1149  c = getc();
1150  }
1151  // if the dimension is just "-" then it is wrong
1152  if(dimension == "-")
1153  {
1154  ungetc('-');
1155  dimension = "";
1156  }
1157  }
1158  else if(c == '%')
1159  {
    // force the decimal number path below, the '%' itself is consumed
1160 #pragma GCC diagnostic push
1161 #pragma GCC diagnostic ignored "-Wfloat-equal"
1162  if(decimal_frac == 1.0)
1163 #pragma GCC diagnostic pop
1164  {
1165  decimal_frac = 10.0;
1166  }
1167  }
1168  else
1169  {
1170  ungetc(c);
1171  }
1172 
1173  node::pointer_t n;
1174 
1175 #pragma GCC diagnostic push
1176 #pragma GCC diagnostic ignored "-Wfloat-equal"
1177  if(exponent != 0
1178  || decimal_frac != 1.0)
1179 #pragma GCC diagnostic pop
1180  {
1182  // Note: CSS defines this math as such and thus we follow that scheme
1183  // instead of the usual immediate conversion
1184  //
1185  // TODO: We may want to check/know about gross overflows?
1186  //
1187 //std::cerr << "+++ integer = [" << integer << "]\n"
1188 // << "+++ decimal_part = [" << decimal_part << "] / [" << decimal_frac << "]\n"
1189 // << "+++ exponent = [" << exponent << "]\n";
1190  n->set_decimal_number(sign * (static_cast<decimal_number_t>(integer) + decimal_part)
1191  * pow(10.0, static_cast<decimal_number_t>(exponent)));
1192  if(c == '%')
1193  {
1194  // a percent value is generally from 0.0 to 1.0, so convert it now
1195  n->set_decimal_number(n->get_decimal_number() / 100.0);
1196  }
1197  else
1198  {
1199  n->set_string(dimension);
1200  }
1201  }
1202  else
1203  {
1205  n->set_integer(integer * sign);
1206  n->set_string(dimension);
1207  }
1208  n->set_boolean(has_sign);
1209  return n;
1210 }
1211 
// Body of lexer::hash(): the cross-reference index at the end of this
// listing gives "node::pointer_t hash()" as defined at lexer.cpp:1212.
// NOTE(review): the signature line and line 1240 (presumably the statement
// allocating the node `n`) are absent from this listing.
//
// Reads the name following a '#' (the '#' itself was consumed by the
// caller): hash characters and escapes are accumulated. An empty name
// generates an error and a null pointer is returned.
1213 {
1214  std::string str;
1215  for(;;)
1216  {
1217  wide_char_t c(getc());
1218  if(c == '\\')
1219  {
1220  c = escape();
1221  if(c == 0xFFFD)
1222  {
    // invalid escape, escape() already emitted the error
1223  break;
1224  }
1225  }
1226  else if(!is_hash_character(c))
1227  {
    // that character is not part of the name, keep it for later
1228  ungetc(c);
1229  break;
1230  }
1231  str += wctomb(c);
1232  }
1233 
1234  if(str.empty())
1235  {
1236  error::instance() << f_start_position << "'#' by itself is not valid." << error_mode_t::ERROR_ERROR;
1237  return node::pointer_t();
1238  }
1239 
1241  n->set_string(str);
1242  return n;
1243 }
1244 
1245 std::string lexer::string(wide_char_t const quote)
1246 {
1247  std::string str;
1248  for(;;)
1249  {
1250  wide_char_t c(getc());
1251  if(c == EOF)
1252  {
1253  // In CSS this is not considered an error, it very much is for us
1254  // (optimization of that kind is not allowed in our sources)
1255  error::instance() << f_start_position << "found an unterminated string." << error_mode_t::ERROR_ERROR;
1256  return str;
1257  }
1258  if(c == '\n')
1259  {
1260  // remember that whitespaces are significant in CSS
1261  ungetc(c);
1262  error::instance() << f_start_position << "found an unterminated string with an unescaped newline." << error_mode_t::ERROR_ERROR;
1263  return str;
1264  }
1265  if(c == quote)
1266  {
1267  return str;
1268  }
1269  if(c == '\\')
1270  {
1271  // escape
1272  wide_char_t n(getc());
1273  if(n == '\n')
1274  {
1275  c = '\n';
1276  }
1277  else if(n == EOF)
1278  {
1279  c = EOF;
1280  }
1281  else if(n == 0xFFFD)
1282  {
1283  // We have a special case here because ungetc(0xFFFD) does
1284  // nothing so we would not otherwise catch this error!
1285  error::instance() << f_start_position << "invalid character after a \\ character." << error_mode_t::ERROR_ERROR;
1286  c = EOF; // do not insert anything more in the string for this entry
1287  }
1288  else
1289  {
1290  ungetc(n);
1291  c = escape();
1292  }
1293  }
1294 
1295  if(c != EOF
1296  && c != 0xFFFD)
1297  {
1298  str += wctomb(c);
1299  }
1300  }
1301  //NOTREACHED
1302 }
1303 
// Body of lexer::comment(): the cross-reference index at the end of this
// listing gives "node::pointer_t comment(bool c_comment)" as defined at
// lexer.cpp:1304.
// NOTE(review): the signature line and line 1414 (presumably the statement
// allocating the node `n`) are absent from this listing.
//
// Reads a comment whose introducer was already consumed by the caller.
// When `c_comment` is true the text is read up to the closing "*/";
// otherwise it is read up to the end of the line and the following lines
// are appended as long as they start with "//". A node is returned only
// when the text includes "@preserve" (its integer is 1 for a C-like
// comment, 0 otherwise); any other comment results in a null pointer.
1305 {
1306  std::string str;
1307 
1308  if(c_comment)
1309  {
1310  // skip leading spaces
1311  for(;;)
1312  {
1313  wide_char_t const c(getc());
1314  if(!is_space(c))
1315  {
1316  ungetc(c);
1317  break;
1318  }
1319  }
1320 
1321  // read up to the next "*/" sequence
1322  for(;;)
1323  {
1324  wide_char_t c(getc());
1325  if(c == EOF)
1326  {
1327  error::instance() << f_start_position << "unclosed C-like comment at the end of your document." << error_mode_t::ERROR_ERROR;
1328  break;
1329  }
1330  if(c == '*')
1331  {
1332  c = getc();
1333  if(c == '/')
1334  {
1335  break;
1336  }
    // not the end of the comment: restore the character and
    // keep the '*' as part of the text
1337  ungetc(c);
1338  c = '*';
1339  }
1340  //else if(c == '\n') ... remove the starting '*' or ' *'?
1341  str += wctomb(c);
1342  }
1343  }
1344  else
1345  {
1346  // skip leading spaces, but not newlines!
1347  for(;;)
1348  {
1349  wide_char_t const c(getc());
1350  if(c != ' '
1351  && c != '\t')
1352  {
1353  ungetc(c);
1354  break;
1355  }
1356  }
1357 
1358  // read up to the next "\n" character, however, we also
1359  // save the following lines if these also are C++ like
1360  // comments because it certainly represents one block
1361  for(;;)
1362  {
1363  wide_char_t c(getc());
1364  if(c == EOF)
1365  {
1366  break;
1367  }
1368  if(c == '\n')
1369  {
    // does the next line start with "//"?
1370  c = getc();
1371  if(c == '/')
1372  {
1373  c = getc();
1374  if(c == '/')
1375  {
1376  // include a newline, but not the "//" sequence
1377  str += '\n';
1378  // remove the first space if there is such
1379  // it will be readded by the assembler
1380  c = getc();
1381  if(c != ' '
1382  && c != '\t')
1383  {
1384  ungetc(c);
1385  }
1386  continue;
1387  }
    // a single '/': restore what follows it, the '/' itself is
    // pushed back just below
1388  ungetc(c);
1389  c = '/';
1390  }
1391  ungetc(c);
1392 
1393  // whitespaces can be significant in CSS, we want the '\n'
1394  // to generate one here too
1395  ungetc('\n');
1396  break;
1397  }
1398  str += wctomb(c);
1399  }
1400  }
1401 
1402  //
1403  // comments are kept only if marked with the special @-keyword:
1404  // @preserve
1405  //
1406  if(str.find("@preserve") != std::string::npos)
1407  {
1408  // remove ending spaces
1409  while(!str.empty() && is_space(str.back()))
1410  {
1411  str.pop_back();
1412  }
1413 
1415  n->set_string(str);
1416  n->set_integer(c_comment ? 1 : 0); // make sure to keep the type of comment
1417  return n;
1418  }
1419 
1420  return node::pointer_t();
1421 }
1422 
// Body of lexer::unicode_range(): the cross-reference index at the end of
// this listing gives "node::pointer_t unicode_range(wide_char_t c)" as
// defined at lexer.cpp:1423.
// NOTE(review): the signature line and lines 1490, 1498 and 1505
// (presumably the statements allocating the node `n`) are absent from this
// listing; the parameter is used under the name `d` in the body.
//
// Reads a unicode range in one of three forms: a single value (up to six
// hexadecimal digits), a masked value (hexadecimal digits followed by '?'
// characters, six characters in total at most) or "<start>-<end>". The
// resulting range is saved in the node as an integer.
//
// NOTE(review): on the two visible error paths the function returns before
// the ungetc(d) found near the end, so the character that ended the range
// does not appear to be pushed back in those cases -- confirm against the
// complete source.
1424 {
1425  // U+ was skipped in the next_token() function
1426  // 'd' represents the first digit on entry
1427  wide_char_t start(0);
1428  wide_char_t end(0);
1429  bool has_mask(false);
    // hexadecimal digits are accepted until the first '?', after which
    // only '?' characters are accepted
1430  for(int count(0);
1431  count < 6 && ((is_hex(d) && !has_mask) || d == '?');
1432  ++count, d = getc())
1433  {
1434  if(d == '?')
1435  {
1436  if(!has_mask)
1437  {
1438  end = start;
1439  }
1440  has_mask = true;
    // each '?' stands for 0 in the start and F in the end
1441  start *= 16;
1442  end = end * 16 + 15;
1443  }
1444  else
1445  {
1446  start = start * 16 + hex_to_dec(d);
1447  }
1448  }
1449 
1450  // if no mask (? chars) then we may have a dash (-) and a specific end
1451  if(has_mask)
1452  {
1453  if(start >= 0x110000)
1454  {
1455  error::instance() << f_start_position << "unicode character too large, range is U+000000 to U+10FFFF." << error_mode_t::ERROR_ERROR;
1456  start = 0; // avoid a double error with start > end
1457  }
1458  // the end of a unicode range may include values that are not
1459  // representing valid Unicode characters; but we have to support
1460  // such to accept all possible masks (i.e. 1?????)
1461  if(end > 0x1FFFFF)
1462  {
1463  // this can legally happen when using a mask such as "1?????"
1464  end = 0x1FFFFF;
1465  }
1466  }
1467  else
1468  {
1469  if(d == '-')
1470  {
1471  // skip the '-'
1472  d = getc();
1473 
1474  // in this case the '?' are not allowed
1475  for(int count(0); count < 6 && is_hex(d); ++count, d = getc())
1476  {
1477  end = end * 16 + hex_to_dec(d);
1478  }
1479  }
1480  else
1481  {
1482  // not specified, same as start
1483  end = start;
1484  }
1485 
1486  if(start >= 0x110000
1487  || end >= 0x110000)
1488  {
1489  error::instance() << f_start_position << "unicode character too large, range is U+000000 to U+10FFFF." << error_mode_t::ERROR_ERROR;
1491  return n;
1492  }
1493  }
1494 
1495  if(start > end)
1496  {
1497  error::instance() << f_start_position << "unicode range cannot have a start character larger than the end character." << error_mode_t::ERROR_ERROR;
1499  return n;
1500  }
1501 
1502  // whatever character ended the range is pushed back
1503  ungetc(d);
1504 
1506  unicode_range_t range(start, end);
1507  n->set_integer(range.get_range());
1508  return n;
1509 }
1510 
// Body of lexer::variable(): the cross-reference index at the end of this
// listing gives "node::pointer_t variable(wide_char_t c)" as defined at
// lexer.cpp:1511.
// NOTE(review): the signature line and lines 1538 and 1544 (presumably the
// statements allocating the node `n` in each branch) are absent from this
// listing.
//
// Reads a variable name starting with `c` (the '$' was consumed by the
// caller.) The name is saved in lowercase with each '-' replaced by '_'.
// A name immediately followed by '(' is handled as a function call (the
// '(' is consumed); otherwise the character ending the name is pushed
// back.
1512 {
1513  std::string var;
1514 
1515  for(;;)
1516  {
1517  // SASS accepts '-' and '_' as the same character;
1518  // we suggest you use the underscore to be more compatible with
1519  // other languages that do not support a '-' in variable names
1520  if(c == '-')
1521  {
1522  c = '_';
1523  }
1524  var += wctomb(std::tolower(c));
1525  c = getc();
1526  if(!is_variable(c))
1527  {
1528  break;
1529  }
1530  }
1531 
1532  node::pointer_t n;
1533 
1534  if(c == '(')
1535  {
1536  // in this case we have a function call
1537  // functions can be defined using @mixin func(...) { ... }
1539  }
1540  else
1541  {
1542  ungetc(c);
1543 
1545  }
1546 
1547  // we got a variable
1548  n->set_string(var);
1549  return n;
1550 }
1551 
1552 } // namespace csspp
1553 
1554 // Local Variables:
1555 // mode: cpp
1556 // indent-tabs-mode: nil
1557 // c-basic-offset: 4
1558 // tab-width: 4
1559 // End:
1560 
1561 // vim: ts=4 sw=4 et
int32_t wide_char_t
Definition: csspp.h:49
static bool constexpr is_non_printable(wide_char_t c)
Definition: lexer.h:46
static bool constexpr is_digit(wide_char_t c)
Definition: lexer.h:86
void next_line()
Definition: position.cpp:44
wide_char_t getc()
Definition: lexer.cpp:618
static bool constexpr is_hex(wide_char_t c)
Definition: lexer.h:91
static bool constexpr is_space(wide_char_t c)
Definition: lexer.h:38
std::shared_ptr< node > pointer_t
Definition: node.h:122
node::pointer_t hash()
Definition: lexer.cpp:1212
wide_char_t escape()
Definition: lexer.cpp:763
node::pointer_t number(wide_char_t c)
Definition: lexer.cpp:1002
int64_t integer_t
Definition: csspp.h:52
range_value_t get_range() const
static bool constexpr is_hash_character(wide_char_t c)
Definition: lexer.h:98
std::string string(wide_char_t const quote)
Definition: lexer.cpp:1245
position f_position
Definition: lexer.h:126
node_type_t
Definition: node.h:36
node::pointer_t next_token()
Definition: lexer.cpp:60
void ungetc(wide_char_t c)
Definition: lexer.cpp:716
node::pointer_t identifier(wide_char_t c)
Definition: lexer.cpp:823
void wctomb(wide_char_t const wc, char *mb, size_t max_length)
Definition: lexer.cpp:548
node::pointer_t variable(wide_char_t c)
Definition: lexer.cpp:1511
static bool constexpr is_variable(wide_char_t c)
Definition: lexer.h:56
std::istream & f_in
Definition: lexer.h:125
static bool constexpr is_start_identifier(wide_char_t c)
Definition: lexer.h:77
wide_char_t mbtowc(char const *mb)
Definition: lexer.cpp:487
double decimal_number_t
Definition: csspp.h:53
node::pointer_t unicode_range(wide_char_t c)
Definition: lexer.cpp:1423
position f_start_position
Definition: lexer.h:127
static error & instance()
Definition: error.cpp:78
static int hex_to_dec(wide_char_t c)
Definition: lexer.cpp:744
lexer(std::istream &in, position const &pos)
Definition: lexer.cpp:53
wide_char_t f_ungetc[UNGETSIZ]
Definition: lexer.h:128
size_t f_ungetc_pos
Definition: lexer.h:129
static bool constexpr is_identifier(wide_char_t c)
Definition: lexer.h:66
node::pointer_t comment(bool c_comment)
Definition: lexer.cpp:1304
void next_page()
Definition: position.cpp:50

Documentation of CSS Preprocessor.

This document is part of the Snap! Websites Project.

Copyright by Made to Order Software Corp.