1/*
2 * MD4C: Markdown parser for C
3 * (http://github.com/mity/md4c)
4 *
5 * Copyright (c) 2016-2020 Martin Mitas
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 */
25
26#include "md4c.h"
27
28#include <limits.h>
29#include <stdio.h>
30#include <stdlib.h>
31#include <string.h>
32
33
34/*****************************
35 *** Miscellaneous Stuff ***
36 *****************************/
37
/* Portability shim for the "inline" keyword.
 *
 * "inline" became a keyword only with C99 (__STDC_VERSION__ == 199901L).
 * Every older dialect -- C89/C90 (macro not defined at all) as well as
 * C94/AMD1 (199409L) -- gets a compiler-specific spelling instead, or nothing
 * at all when no such spelling is known. */
#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L
    #if defined __GNUC__
        #define inline __inline__
    #elif defined _MSC_VER
        #define inline __inline
    #else
        #define inline
    #endif
#endif
48
/* Make the UTF-8 support the default.
 * If the build selects none of MD4C_USE_ASCII, MD4C_USE_UTF8 and
 * MD4C_USE_UTF16, MD4C_USE_UTF8 gets defined here. */
#if !defined MD4C_USE_ASCII && !defined MD4C_USE_UTF8 && !defined MD4C_USE_UTF16
    #define MD4C_USE_UTF8
#endif
53
/* Magic for making wide literals with MD4C_USE_UTF16.
 * Any earlier definition of _T is dropped first. _T(x) then pastes the L
 * prefix in front of the literal x in UTF-16 builds and leaves x untouched
 * in all other builds. */
#ifdef _T
    #undef _T
#endif
#if defined MD4C_USE_UTF16
    #define _T(x)           L##x
#else
    #define _T(x)           x
#endif
63
/* Misc. macros. */

/* Count of elements of a real array (must not be used with a pointer).
 * The argument is parenthesized in the expansion so that any array-valued
 * expression can be passed safely. */
#define SIZEOF_ARRAY(a)     (sizeof(a) / sizeof((a)[0]))

/* Two-level stringification: STRINGIZE() macro-expands its argument before
 * turning it into a string literal, STRINGIZE_() stringifies it verbatim. */
#define STRINGIZE_(x)       #x
#define STRINGIZE(x)        STRINGIZE_(x)

/* Boolean constants, unless something else has provided TRUE already. */
#ifndef TRUE
    #define TRUE            1
    #define FALSE           0
#endif
74
75
76/************************
77 *** Internal Types ***
78 ************************/
79
/* These are omnipresent so let's save some typing.
 * They are plain aliases of the MD_CHAR, MD_SIZE and MD_OFFSET types. */
#define CHAR    MD_CHAR
#define SZ      MD_SIZE
#define OFF     MD_OFFSET

/* Forward declarations: only the typedef names are introduced here so that
 * MD_CTX below can hold pointers to these structures. */
typedef struct MD_MARK_tag MD_MARK;
typedef struct MD_BLOCK_tag MD_BLOCK;
typedef struct MD_CONTAINER_tag MD_CONTAINER;
typedef struct MD_REF_DEF_tag MD_REF_DEF;
89
90
/* During the analysis of inline marks, we need to manage some "mark chains"
 * of (yet unresolved) openers. This structure holds the start/end of a chain.
 * The chain internals are then realized through MD_MARK::prev and ::next.
 */
typedef struct MD_MARKCHAIN_tag MD_MARKCHAIN;
struct MD_MARKCHAIN_tag {
    int head;   /* Index of first mark in the chain, or -1 if empty. */
    int tail;   /* Index of last mark in the chain, or -1 if empty. */
};
100
/* Context propagated through all the parsing.
 *
 * Most helper macros in this file (CH(), STR(), MD_LOG(), MD_TEXT(), ...)
 * expect a pointer to this structure to be in scope under the name "ctx". */
typedef struct MD_CTX_tag MD_CTX;
struct MD_CTX_tag {
    /* Immutable stuff (parameters of md_parse()). */
    const CHAR* text;
    SZ size;
    MD_PARSER parser;
    void* userdata;

    /* When this is true, it allows some optimizations. */
    int doc_ends_with_newline;

    /* Helper temporary growing buffer.
     * MD_TEMP_BUFFER() reallocates it on demand and stores the size it has
     * requested from realloc() in alloc_buffer. */
    CHAR* buffer;
    unsigned alloc_buffer;

    /* Reference definitions. */
    MD_REF_DEF* ref_defs;
    int n_ref_defs;
    int alloc_ref_defs;
    void** ref_def_hashtable;
    int ref_def_hashtable_size;

    /* Stack of inline/span markers.
     * This is only used for parsing a single block contents but by storing it
     * here we may reuse the stack for subsequent blocks; i.e. we have fewer
     * (re)allocations. */
    MD_MARK* marks;
    int n_marks;
    int alloc_marks;

    /* Per-character lookup table; 128 entries in UTF-16 builds, 256 otherwise.
     * NOTE(review): presumably flags the characters which may start a mark --
     * confirm against the code filling it. */
#if defined MD4C_USE_UTF16
    char mark_char_map[128];
#else
    char mark_char_map[256];
#endif

    /* For resolving of inline spans.
     * The macros below give every chain a name; they require "ctx" in scope.
     * OPENERS_CHAIN_FIRST..OPENERS_CHAIN_LAST is the index range of the
     * chains named *_OPENERS*. */
    MD_MARKCHAIN mark_chains[13];
#define PTR_CHAIN                               (ctx->mark_chains[0])
#define TABLECELLBOUNDARIES                     (ctx->mark_chains[1])
#define ASTERISK_OPENERS_extraword_mod3_0       (ctx->mark_chains[2])
#define ASTERISK_OPENERS_extraword_mod3_1       (ctx->mark_chains[3])
#define ASTERISK_OPENERS_extraword_mod3_2       (ctx->mark_chains[4])
#define ASTERISK_OPENERS_intraword_mod3_0       (ctx->mark_chains[5])
#define ASTERISK_OPENERS_intraword_mod3_1       (ctx->mark_chains[6])
#define ASTERISK_OPENERS_intraword_mod3_2       (ctx->mark_chains[7])
#define UNDERSCORE_OPENERS                      (ctx->mark_chains[8])
#define TILDE_OPENERS_1                         (ctx->mark_chains[9])
#define TILDE_OPENERS_2                         (ctx->mark_chains[10])
#define BRACKET_OPENERS                         (ctx->mark_chains[11])
#define DOLLAR_OPENERS                          (ctx->mark_chains[12])
#define OPENERS_CHAIN_FIRST                     2
#define OPENERS_CHAIN_LAST                      12

    int n_table_cell_boundaries;

    /* For resolving links. */
    int unresolved_link_head;
    int unresolved_link_tail;

    /* For resolving raw HTML. */
    OFF html_comment_horizon;
    OFF html_proc_instr_horizon;
    OFF html_decl_horizon;
    OFF html_cdata_horizon;

    /* For block analysis.
     * Notes:
     *   -- It holds MD_BLOCK as well as MD_LINE structures. After each
     *      MD_BLOCK, its (multiple) MD_LINE(s) follow.
     *   -- For MD_BLOCK_HTML and MD_BLOCK_CODE, MD_VERBATIMLINE(s) are used
     *      instead of MD_LINE(s).
     */
    void* block_bytes;
    MD_BLOCK* current_block;
    int n_block_bytes;
    int alloc_block_bytes;

    /* For container block analysis. */
    MD_CONTAINER* containers;
    int n_containers;
    int alloc_containers;

    /* Minimal indentation to call the block "indented code block". */
    unsigned code_indent_offset;

    /* Contextual info for line analysis. */
    SZ code_fence_length;   /* For checking closing fence length. */
    int html_block_type;    /* For checking closing raw HTML condition. */
    int last_line_has_list_loosening_effect;
    int last_list_item_starts_with_two_blank_lines;
};
194
/* Line kinds, stored in MD_LINE_ANALYSIS::type.
 * NOTE(review): the meaning of the individual values is taken from their
 * names only; the code assigning them is not part of this chunk. */
enum MD_LINETYPE_tag {
    MD_LINE_BLANK,
    MD_LINE_HR,
    MD_LINE_ATXHEADER,
    MD_LINE_SETEXTHEADER,
    MD_LINE_SETEXTUNDERLINE,
    MD_LINE_INDENTEDCODE,
    MD_LINE_FENCEDCODE,
    MD_LINE_HTML,
    MD_LINE_TEXT,
    MD_LINE_TABLE,
    MD_LINE_TABLEUNDERLINE
};
typedef enum MD_LINETYPE_tag MD_LINETYPE;
209
/* Result of analysing one line: its kind, a 16-bit payload and its extent.
 * NOTE(review): beg/end are presumably offsets into MD_CTX::text and the
 * meaning of "data" presumably depends on "type" -- confirm with the users. */
typedef struct MD_LINE_ANALYSIS_tag MD_LINE_ANALYSIS;
struct MD_LINE_ANALYSIS_tag {
    MD_LINETYPE type    : 16;
    unsigned data       : 16;
    OFF beg;
    OFF end;
    unsigned indent;        /* Indentation level. */
};
218
/* Extent of one line. Per the notes in MD_CTX, these records follow their
 * MD_BLOCK in MD_CTX::block_bytes.
 * NOTE(review): beg/end are presumably offsets into MD_CTX::text -- confirm. */
typedef struct MD_LINE_tag MD_LINE;
struct MD_LINE_tag {
    OFF beg;
    OFF end;
};
224
/* Line record used instead of MD_LINE for MD_BLOCK_HTML and MD_BLOCK_CODE
 * (see the notes in MD_CTX); it additionally carries an indentation value. */
typedef struct MD_VERBATIMLINE_tag MD_VERBATIMLINE;
struct MD_VERBATIMLINE_tag {
    OFF beg;
    OFF end;
    OFF indent;
};
231
232
233/*******************
234 *** Debugging ***
235 *******************/
236
/* Pass a message to the parser's debug_log callback, if one is set.
 * Requires a variable named "ctx" (pointing to MD_CTX) in scope. */
#define MD_LOG(msg)                                                     \
    do {                                                                \
        if(ctx->parser.debug_log != NULL)                               \
            ctx->parser.debug_log((msg), ctx->userdata);                \
    } while(0)

#ifdef DEBUG
    /* Debug builds: a failed assertion is reported through MD_LOG() together
     * with the file name and line number, then the process exits with 1. */
    #define MD_ASSERT(cond)                                             \
            do {                                                        \
                if(!(cond)) {                                           \
                    MD_LOG(__FILE__ ":" STRINGIZE(__LINE__) ": "        \
                           "Assertion '" STRINGIZE(cond) "' failed.");  \
                    exit(1);                                            \
                }                                                       \
            } while(0)

    #define MD_UNREACHABLE()        MD_ASSERT(1 == 0)
#else
    /* Non-debug builds: the assertions turn into optimizer hints where the
     * compiler offers some (__builtin_unreachable(), __assume()), and into
     * no-ops elsewhere. Note that with the hints in place the condition is
     * not checked at run time.
     * NOTE(review): "_MSC_VER > 120" looks like it is true for every MSVC
     * version number -- confirm the intended threshold. */
    #ifdef __GNUC__
        #define MD_ASSERT(cond)     do { if(!(cond)) __builtin_unreachable(); } while(0)
        #define MD_UNREACHABLE()    do { __builtin_unreachable(); } while(0)
    #elif defined _MSC_VER  &&  _MSC_VER > 120
        #define MD_ASSERT(cond)     do { __assume(cond); } while(0)
        #define MD_UNREACHABLE()    do { __assume(0); } while(0)
    #else
        #define MD_ASSERT(cond)     do {} while(0)
        #define MD_UNREACHABLE()    do {} while(0)
    #endif
#endif
266
267
268/*****************
269 *** Helpers ***
270 *****************/
271
/* Character accessors.
 * CH(off) is the character at the given offset of the input text, STR(off)
 * a pointer to it. Both require "ctx" in scope. */
#define CH(off)                 (ctx->text[(off)])
#define STR(off)                (ctx->text + (off))

/* Character classification.
 * Note we assume ASCII compatibility of code points < 128 here.
 *
 * The variants with a trailing underscore take a character value; most of
 * them evaluate their argument more than once, so do not pass expressions
 * with side effects. ISANYOF_() never matches the NUL character. */
#define ISIN_(ch, ch_min, ch_max)       ((ch_min) <= (unsigned)(ch) && (unsigned)(ch) <= (ch_max))
#define ISANYOF_(ch, palette)           ((ch) != _T('\0')  &&  md_strchr((palette), (ch)) != NULL)
#define ISANYOF2_(ch, ch1, ch2)         ((ch) == (ch1) || (ch) == (ch2))
#define ISANYOF3_(ch, ch1, ch2, ch3)    ((ch) == (ch1) || (ch) == (ch2) || (ch) == (ch3))
#define ISASCII_(ch)                    ((unsigned)(ch) <= 127)
#define ISBLANK_(ch)                    (ISANYOF2_((ch), _T(' '), _T('\t')))
#define ISNEWLINE_(ch)                  (ISANYOF2_((ch), _T('\r'), _T('\n')))
#define ISWHITESPACE_(ch)               (ISBLANK_(ch) || ISANYOF2_((ch), _T('\v'), _T('\f')))
#define ISCNTRL_(ch)                    ((unsigned)(ch) <= 31 || (unsigned)(ch) == 127)
#define ISPUNCT_(ch)                    (ISIN_(ch, 33, 47) || ISIN_(ch, 58, 64) || ISIN_(ch, 91, 96) || ISIN_(ch, 123, 126))
#define ISUPPER_(ch)                    (ISIN_(ch, _T('A'), _T('Z')))
#define ISLOWER_(ch)                    (ISIN_(ch, _T('a'), _T('z')))
#define ISALPHA_(ch)                    (ISUPPER_(ch) || ISLOWER_(ch))
#define ISDIGIT_(ch)                    (ISIN_(ch, _T('0'), _T('9')))
#define ISXDIGIT_(ch)                   (ISDIGIT_(ch) || ISIN_(ch, _T('A'), _T('F')) || ISIN_(ch, _T('a'), _T('f')))
#define ISALNUM_(ch)                    (ISALPHA_(ch) || ISDIGIT_(ch))

/* The same predicates applied to the character at offset "off" of the input
 * text (through CH(), hence "ctx" has to be in scope). */
#define ISANYOF(off, palette)           ISANYOF_(CH(off), (palette))
#define ISANYOF2(off, ch1, ch2)         ISANYOF2_(CH(off), (ch1), (ch2))
#define ISANYOF3(off, ch1, ch2, ch3)    ISANYOF3_(CH(off), (ch1), (ch2), (ch3))
#define ISASCII(off)                    ISASCII_(CH(off))
#define ISBLANK(off)                    ISBLANK_(CH(off))
#define ISNEWLINE(off)                  ISNEWLINE_(CH(off))
#define ISWHITESPACE(off)               ISWHITESPACE_(CH(off))
#define ISCNTRL(off)                    ISCNTRL_(CH(off))
#define ISPUNCT(off)                    ISPUNCT_(CH(off))
#define ISUPPER(off)                    ISUPPER_(CH(off))
#define ISLOWER(off)                    ISLOWER_(CH(off))
#define ISALPHA(off)                    ISALPHA_(CH(off))
#define ISDIGIT(off)                    ISDIGIT_(CH(off))
#define ISXDIGIT(off)                   ISXDIGIT_(CH(off))
#define ISALNUM(off)                    ISALNUM_(CH(off))


/* strchr() counterpart matching the width of CHAR; used by ISANYOF_(). */
#if defined MD4C_USE_UTF16
    #define md_strchr wcschr
#else
    #define md_strchr strchr
#endif
317
318
319/* Case insensitive check of string equality. */
320static inline int
321md_ascii_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
322{
323 OFF i;
324 for(i = 0; i < n; i++) {
325 CHAR ch1 = s1[i];
326 CHAR ch2 = s2[i];
327
328 if(ISLOWER_(ch1))
329 ch1 += ('A'-'a');
330 if(ISLOWER_(ch2))
331 ch2 += ('A'-'a');
332 if(ch1 != ch2)
333 return FALSE;
334 }
335 return TRUE;
336}
337
338static inline int
339md_ascii_eq(const CHAR* s1, const CHAR* s2, SZ n)
340{
341 return memcmp(s1: s1, s2: s2, n: n * sizeof(CHAR)) == 0;
342}
343
344static int
345md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size)
346{
347 OFF off = 0;
348 int ret = 0;
349
350 while(1) {
351 while(off < size && str[off] != _T('\0'))
352 off++;
353
354 if(off > 0) {
355 ret = ctx->parser.text(type, str, off, ctx->userdata);
356 if(ret != 0)
357 return ret;
358
359 str += off;
360 size -= off;
361 off = 0;
362 }
363
364 if(off >= size)
365 return 0;
366
367 ret = ctx->parser.text(MD_TEXT_NULLCHAR, _T(""), 1, ctx->userdata);
368 if(ret != 0)
369 return ret;
370 off++;
371 }
372}
373
374
/* Evaluate "func", store the result into the local variable "ret" and jump
 * to the label "abort" when the result is negative. Both "ret" and "abort"
 * have to exist in the calling function. */
#define MD_CHECK(func)                                                      \
    do {                                                                    \
        ret = (func);                                                       \
        if(ret < 0)                                                         \
            goto abort;                                                     \
    } while(0)
381
382
/* Make sure ctx->buffer has been allocated with at least "sz" as the size
 * passed to realloc(). When it has to grow, one and a half of the request
 * plus some slack is asked for (cleared to a multiple of 128), so that
 * repeated requests do not reallocate every time.
 *
 * On allocation failure the problem is logged, "ret" is set to -1 and
 * control jumps to the label "abort"; the old buffer stays valid then.
 * Requires "ctx", "ret" and "abort" in the calling function. The argument
 * is evaluated several times, so it must not have side effects. */
#define MD_TEMP_BUFFER(sz)                                                  \
    do {                                                                    \
        if((sz) > ctx->alloc_buffer) {                                      \
            CHAR* new_buffer;                                               \
            SZ new_size = ((sz) + (sz) / 2 + 128) & ~127;                   \
                                                                            \
            new_buffer = realloc(ctx->buffer, new_size);                    \
            if(new_buffer == NULL) {                                        \
                MD_LOG("realloc() failed.");                                \
                ret = -1;                                                   \
                goto abort;                                                 \
            }                                                               \
                                                                            \
            ctx->buffer = new_buffer;                                       \
            ctx->alloc_buffer = new_size;                                   \
        }                                                                   \
    } while(0)
400
401
/* Wrappers around the enter/leave callbacks of the parser.
 * Each of them calls the respective callback with (type, arg, userdata),
 * stores the result into "ret" and, if it is non-zero, logs a message and
 * jumps to the label "abort". They require "ctx", "ret" and "abort" in the
 * calling function. */
#define MD_ENTER_BLOCK(type, arg)                                           \
    do {                                                                    \
        ret = ctx->parser.enter_block((type), (arg), ctx->userdata);        \
        if(ret != 0) {                                                      \
            MD_LOG("Aborted from enter_block() callback.");                 \
            goto abort;                                                     \
        }                                                                   \
    } while(0)

#define MD_LEAVE_BLOCK(type, arg)                                           \
    do {                                                                    \
        ret = ctx->parser.leave_block((type), (arg), ctx->userdata);        \
        if(ret != 0) {                                                      \
            MD_LOG("Aborted from leave_block() callback.");                 \
            goto abort;                                                     \
        }                                                                   \
    } while(0)

#define MD_ENTER_SPAN(type, arg)                                            \
    do {                                                                    \
        ret = ctx->parser.enter_span((type), (arg), ctx->userdata);         \
        if(ret != 0) {                                                      \
            MD_LOG("Aborted from enter_span() callback.");                  \
            goto abort;                                                     \
        }                                                                   \
    } while(0)

#define MD_LEAVE_SPAN(type, arg)                                            \
    do {                                                                    \
        ret = ctx->parser.leave_span((type), (arg), ctx->userdata);         \
        if(ret != 0) {                                                      \
            MD_LOG("Aborted from leave_span() callback.");                  \
            goto abort;                                                     \
        }                                                                   \
    } while(0)
437
/* Report a text through the text() callback, unless it is empty.
 * A non-zero result of the callback is stored in "ret", logged, and control
 * jumps to the label "abort". Requires "ctx", "ret" and "abort" in the
 * calling function; "size" is evaluated twice. */
#define MD_TEXT(type, str, size)                                            \
    do {                                                                    \
        if((size) > 0) {                                                    \
            ret = ctx->parser.text((type), (str), (size), ctx->userdata);   \
            if(ret != 0) {                                                  \
                MD_LOG("Aborted from text() callback.");                    \
                goto abort;                                                 \
            }                                                               \
        }                                                                   \
    } while(0)
448
/* Like MD_TEXT(), but for text which may contain NUL characters: the text
 * is reported through md_text_with_null_replacement(). A non-zero result is
 * stored in "ret", logged, and control jumps to the label "abort".
 * Requires "ctx", "ret" and "abort" in the calling function; "size" is
 * evaluated twice. */
#define MD_TEXT_INSECURE(type, str, size)                                   \
    do {                                                                    \
        if((size) > 0) {                                                    \
            ret = md_text_with_null_replacement(ctx, (type), (str), (size)); \
            if(ret != 0) {                                                  \
                MD_LOG("Aborted from text() callback.");                    \
                goto abort;                                                 \
            }                                                               \
        }                                                                   \
    } while(0)
459
460
461
462/*************************
463 *** Unicode Support ***
464 *************************/
465
/* Output of md_get_unicode_fold_info(): the folded form of one codepoint.
 * Only the first n_codepoints items (1 to 3) of codepoints[] are set. */
typedef struct MD_UNICODE_FOLD_INFO_tag MD_UNICODE_FOLD_INFO;
struct MD_UNICODE_FOLD_INFO_tag {
    unsigned codepoints[3];
    int n_codepoints;
};
471
472
473#if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8
/* Binary search over a sorted "map" of codepoints.
 *
 * A map record is either a single codepoint, or one half of a range: a run
 * of consecutive codepoints is stored as two adjacent records,
 * (MIN_CODEPOINT | 0x40000000) followed by (MAX_CODEPOINT | 0x80000000).
 *
 * Returns the index of the record matching the codepoint (for a range, the
 * index of its MIN record), or -1 if the codepoint is not in the map. */
static int
md_unicode_bsearch__(unsigned codepoint, const unsigned* map, size_t map_size)
{
    int lo = 0;
    int hi = (int) map_size - 1;

    while(lo <= hi) {
        /* The probed record may be either half of a range; widen the probe
         * to [first, last] so that it covers the whole range then. */
        int first = (lo + hi) / 2;
        int last = first;

        if(map[last] & 0x40000000)
            last++;
        if(map[first] & 0x80000000)
            first--;

        if(codepoint < (map[first] & 0x00ffffff)) {
            hi = first - 1;
            continue;
        }
        if(codepoint > (map[last] & 0x00ffffff)) {
            lo = last + 1;
            continue;
        }

        return first;
    }

    return -1;
}
506
/* Tell whether the codepoint is a whitespace.
 *
 * Codepoints up to 0x7f are classified with ISWHITESPACE_() (space, tab,
 * vertical tab, form feed); all the others are looked up in the map of the
 * Unicode "Zs" category.
 *
 * Returns non-zero for a whitespace, zero otherwise. */
static int
md_is_unicode_whitespace__(unsigned codepoint)
{
#define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
#define S(cp)               (cp)
    /* Unicode "Zs" category.
     * (generated by scripts/build_whitespace_map.py) */
    static const unsigned WHITESPACE_MAP[] = {
        S(0x0020), S(0x00a0), S(0x1680), R(0x2000,0x200a), S(0x202f), S(0x205f), S(0x3000)
    };
#undef R
#undef S

    /* The ASCII ones are the most frequently used ones, also CommonMark
     * specification requests few more in this range. */
    if(codepoint <= 0x7f)
        return ISWHITESPACE_(codepoint);

    return (md_unicode_bsearch__(codepoint, WHITESPACE_MAP, SIZEOF_ARRAY(WHITESPACE_MAP)) >= 0);
}
527
/* Tell whether the codepoint is a punctuation character.
 *
 * Codepoints up to 0x7f are classified with ISPUNCT_(); all the others are
 * looked up in the map of the Unicode punctuation categories below.
 *
 * Returns non-zero for a punctuation character, zero otherwise. */
static int
md_is_unicode_punct__(unsigned codepoint)
{
#define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
#define S(cp)               (cp)
    /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories.
     * (generated by scripts/build_punct_map.py) */
    static const unsigned PUNCT_MAP[] = {
        R(0x0021,0x0023), R(0x0025,0x002a), R(0x002c,0x002f), R(0x003a,0x003b), R(0x003f,0x0040),
        R(0x005b,0x005d), S(0x005f), S(0x007b), S(0x007d), S(0x00a1), S(0x00a7), S(0x00ab), R(0x00b6,0x00b7),
        S(0x00bb), S(0x00bf), S(0x037e), S(0x0387), R(0x055a,0x055f), R(0x0589,0x058a), S(0x05be), S(0x05c0),
        S(0x05c3), S(0x05c6), R(0x05f3,0x05f4), R(0x0609,0x060a), R(0x060c,0x060d), S(0x061b), R(0x061e,0x061f),
        R(0x066a,0x066d), S(0x06d4), R(0x0700,0x070d), R(0x07f7,0x07f9), R(0x0830,0x083e), S(0x085e),
        R(0x0964,0x0965), S(0x0970), S(0x09fd), S(0x0a76), S(0x0af0), S(0x0c77), S(0x0c84), S(0x0df4), S(0x0e4f),
        R(0x0e5a,0x0e5b), R(0x0f04,0x0f12), S(0x0f14), R(0x0f3a,0x0f3d), S(0x0f85), R(0x0fd0,0x0fd4),
        R(0x0fd9,0x0fda), R(0x104a,0x104f), S(0x10fb), R(0x1360,0x1368), S(0x1400), S(0x166e), R(0x169b,0x169c),
        R(0x16eb,0x16ed), R(0x1735,0x1736), R(0x17d4,0x17d6), R(0x17d8,0x17da), R(0x1800,0x180a),
        R(0x1944,0x1945), R(0x1a1e,0x1a1f), R(0x1aa0,0x1aa6), R(0x1aa8,0x1aad), R(0x1b5a,0x1b60),
        R(0x1bfc,0x1bff), R(0x1c3b,0x1c3f), R(0x1c7e,0x1c7f), R(0x1cc0,0x1cc7), S(0x1cd3), R(0x2010,0x2027),
        R(0x2030,0x2043), R(0x2045,0x2051), R(0x2053,0x205e), R(0x207d,0x207e), R(0x208d,0x208e),
        R(0x2308,0x230b), R(0x2329,0x232a), R(0x2768,0x2775), R(0x27c5,0x27c6), R(0x27e6,0x27ef),
        R(0x2983,0x2998), R(0x29d8,0x29db), R(0x29fc,0x29fd), R(0x2cf9,0x2cfc), R(0x2cfe,0x2cff), S(0x2d70),
        R(0x2e00,0x2e2e), R(0x2e30,0x2e4f), S(0x2e52), R(0x3001,0x3003), R(0x3008,0x3011), R(0x3014,0x301f),
        S(0x3030), S(0x303d), S(0x30a0), S(0x30fb), R(0xa4fe,0xa4ff), R(0xa60d,0xa60f), S(0xa673), S(0xa67e),
        R(0xa6f2,0xa6f7), R(0xa874,0xa877), R(0xa8ce,0xa8cf), R(0xa8f8,0xa8fa), S(0xa8fc), R(0xa92e,0xa92f),
        S(0xa95f), R(0xa9c1,0xa9cd), R(0xa9de,0xa9df), R(0xaa5c,0xaa5f), R(0xaade,0xaadf), R(0xaaf0,0xaaf1),
        S(0xabeb), R(0xfd3e,0xfd3f), R(0xfe10,0xfe19), R(0xfe30,0xfe52), R(0xfe54,0xfe61), S(0xfe63), S(0xfe68),
        R(0xfe6a,0xfe6b), R(0xff01,0xff03), R(0xff05,0xff0a), R(0xff0c,0xff0f), R(0xff1a,0xff1b),
        R(0xff1f,0xff20), R(0xff3b,0xff3d), S(0xff3f), S(0xff5b), S(0xff5d), R(0xff5f,0xff65), R(0x10100,0x10102),
        S(0x1039f), S(0x103d0), S(0x1056f), S(0x10857), S(0x1091f), S(0x1093f), R(0x10a50,0x10a58), S(0x10a7f),
        R(0x10af0,0x10af6), R(0x10b39,0x10b3f), R(0x10b99,0x10b9c), S(0x10ead), R(0x10f55,0x10f59),
        R(0x11047,0x1104d), R(0x110bb,0x110bc), R(0x110be,0x110c1), R(0x11140,0x11143), R(0x11174,0x11175),
        R(0x111c5,0x111c8), S(0x111cd), S(0x111db), R(0x111dd,0x111df), R(0x11238,0x1123d), S(0x112a9),
        R(0x1144b,0x1144f), R(0x1145a,0x1145b), S(0x1145d), S(0x114c6), R(0x115c1,0x115d7), R(0x11641,0x11643),
        R(0x11660,0x1166c), R(0x1173c,0x1173e), S(0x1183b), R(0x11944,0x11946), S(0x119e2), R(0x11a3f,0x11a46),
        R(0x11a9a,0x11a9c), R(0x11a9e,0x11aa2), R(0x11c41,0x11c45), R(0x11c70,0x11c71), R(0x11ef7,0x11ef8),
        S(0x11fff), R(0x12470,0x12474), R(0x16a6e,0x16a6f), S(0x16af5), R(0x16b37,0x16b3b), S(0x16b44),
        R(0x16e97,0x16e9a), S(0x16fe2), S(0x1bc9f), R(0x1da87,0x1da8b), R(0x1e95e,0x1e95f)
    };
#undef R
#undef S

    /* The ASCII ones are the most frequently used ones, also CommonMark
     * specification requests few more in this range. */
    if(codepoint <= 0x7f)
        return ISPUNCT_(codepoint);

    return (md_unicode_bsearch__(codepoint, PUNCT_MAP, SIZEOF_ARRAY(PUNCT_MAP)) >= 0);
}
577
578 static void
579 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
580 {
581#define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
582#define S(cp) (cp)
583 /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories.
584 * (generated by scripts/build_folding_map.py) */
585 static const unsigned FOLD_MAP_1[] = {
586 R(0x0041,0x005a), S(0x00b5), R(0x00c0,0x00d6), R(0x00d8,0x00de), R(0x0100,0x012e), R(0x0132,0x0136),
587 R(0x0139,0x0147), R(0x014a,0x0176), S(0x0178), R(0x0179,0x017d), S(0x017f), S(0x0181), S(0x0182),
588 S(0x0184), S(0x0186), S(0x0187), S(0x0189), S(0x018a), S(0x018b), S(0x018e), S(0x018f), S(0x0190),
589 S(0x0191), S(0x0193), S(0x0194), S(0x0196), S(0x0197), S(0x0198), S(0x019c), S(0x019d), S(0x019f),
590 R(0x01a0,0x01a4), S(0x01a6), S(0x01a7), S(0x01a9), S(0x01ac), S(0x01ae), S(0x01af), S(0x01b1), S(0x01b2),
591 S(0x01b3), S(0x01b5), S(0x01b7), S(0x01b8), S(0x01bc), S(0x01c4), S(0x01c5), S(0x01c7), S(0x01c8),
592 S(0x01ca), R(0x01cb,0x01db), R(0x01de,0x01ee), S(0x01f1), S(0x01f2), S(0x01f4), S(0x01f6), S(0x01f7),
593 R(0x01f8,0x021e), S(0x0220), R(0x0222,0x0232), S(0x023a), S(0x023b), S(0x023d), S(0x023e), S(0x0241),
594 S(0x0243), S(0x0244), S(0x0245), R(0x0246,0x024e), S(0x0345), S(0x0370), S(0x0372), S(0x0376), S(0x037f),
595 S(0x0386), R(0x0388,0x038a), S(0x038c), S(0x038e), S(0x038f), R(0x0391,0x03a1), R(0x03a3,0x03ab),
596 S(0x03c2), S(0x03cf), S(0x03d0), S(0x03d1), S(0x03d5), S(0x03d6), R(0x03d8,0x03ee), S(0x03f0), S(0x03f1),
597 S(0x03f4), S(0x03f5), S(0x03f7), S(0x03f9), S(0x03fa), R(0x03fd,0x03ff), R(0x0400,0x040f),
598 R(0x0410,0x042f), R(0x0460,0x0480), R(0x048a,0x04be), S(0x04c0), R(0x04c1,0x04cd), R(0x04d0,0x052e),
599 R(0x0531,0x0556), R(0x10a0,0x10c5), S(0x10c7), S(0x10cd), R(0x13f8,0x13fd), S(0x1c80), S(0x1c81),
600 S(0x1c82), S(0x1c83), S(0x1c84), S(0x1c85), S(0x1c86), S(0x1c87), S(0x1c88), R(0x1c90,0x1cba),
601 R(0x1cbd,0x1cbf), R(0x1e00,0x1e94), S(0x1e9b), R(0x1ea0,0x1efe), R(0x1f08,0x1f0f), R(0x1f18,0x1f1d),
602 R(0x1f28,0x1f2f), R(0x1f38,0x1f3f), R(0x1f48,0x1f4d), S(0x1f59), S(0x1f5b), S(0x1f5d), S(0x1f5f),
603 R(0x1f68,0x1f6f), S(0x1fb8), S(0x1fb9), S(0x1fba), S(0x1fbb), S(0x1fbe), R(0x1fc8,0x1fcb), S(0x1fd8),
604 S(0x1fd9), S(0x1fda), S(0x1fdb), S(0x1fe8), S(0x1fe9), S(0x1fea), S(0x1feb), S(0x1fec), S(0x1ff8),
605 S(0x1ff9), S(0x1ffa), S(0x1ffb), S(0x2126), S(0x212a), S(0x212b), S(0x2132), R(0x2160,0x216f), S(0x2183),
606 R(0x24b6,0x24cf), R(0x2c00,0x2c2e), S(0x2c60), S(0x2c62), S(0x2c63), S(0x2c64), R(0x2c67,0x2c6b),
607 S(0x2c6d), S(0x2c6e), S(0x2c6f), S(0x2c70), S(0x2c72), S(0x2c75), S(0x2c7e), S(0x2c7f), R(0x2c80,0x2ce2),
608 S(0x2ceb), S(0x2ced), S(0x2cf2), R(0xa640,0xa66c), R(0xa680,0xa69a), R(0xa722,0xa72e), R(0xa732,0xa76e),
609 S(0xa779), S(0xa77b), S(0xa77d), R(0xa77e,0xa786), S(0xa78b), S(0xa78d), S(0xa790), S(0xa792),
610 R(0xa796,0xa7a8), S(0xa7aa), S(0xa7ab), S(0xa7ac), S(0xa7ad), S(0xa7ae), S(0xa7b0), S(0xa7b1), S(0xa7b2),
611 S(0xa7b3), R(0xa7b4,0xa7be), S(0xa7c2), S(0xa7c4), S(0xa7c5), S(0xa7c6), S(0xa7c7), S(0xa7c9), S(0xa7f5),
612 R(0xab70,0xabbf), R(0xff21,0xff3a), R(0x10400,0x10427), R(0x104b0,0x104d3), R(0x10c80,0x10cb2),
613 R(0x118a0,0x118bf), R(0x16e40,0x16e5f), R(0x1e900,0x1e921)
614 };
615 static const unsigned FOLD_MAP_1_DATA[] = {
616 0x0061, 0x007a, 0x03bc, 0x00e0, 0x00f6, 0x00f8, 0x00fe, 0x0101, 0x012f, 0x0133, 0x0137, 0x013a, 0x0148,
617 0x014b, 0x0177, 0x00ff, 0x017a, 0x017e, 0x0073, 0x0253, 0x0183, 0x0185, 0x0254, 0x0188, 0x0256, 0x0257,
618 0x018c, 0x01dd, 0x0259, 0x025b, 0x0192, 0x0260, 0x0263, 0x0269, 0x0268, 0x0199, 0x026f, 0x0272, 0x0275,
619 0x01a1, 0x01a5, 0x0280, 0x01a8, 0x0283, 0x01ad, 0x0288, 0x01b0, 0x028a, 0x028b, 0x01b4, 0x01b6, 0x0292,
620 0x01b9, 0x01bd, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01cc, 0x01cc, 0x01dc, 0x01df, 0x01ef, 0x01f3, 0x01f3,
621 0x01f5, 0x0195, 0x01bf, 0x01f9, 0x021f, 0x019e, 0x0223, 0x0233, 0x2c65, 0x023c, 0x019a, 0x2c66, 0x0242,
622 0x0180, 0x0289, 0x028c, 0x0247, 0x024f, 0x03b9, 0x0371, 0x0373, 0x0377, 0x03f3, 0x03ac, 0x03ad, 0x03af,
623 0x03cc, 0x03cd, 0x03ce, 0x03b1, 0x03c1, 0x03c3, 0x03cb, 0x03c3, 0x03d7, 0x03b2, 0x03b8, 0x03c6, 0x03c0,
624 0x03d9, 0x03ef, 0x03ba, 0x03c1, 0x03b8, 0x03b5, 0x03f8, 0x03f2, 0x03fb, 0x037b, 0x037d, 0x0450, 0x045f,
625 0x0430, 0x044f, 0x0461, 0x0481, 0x048b, 0x04bf, 0x04cf, 0x04c2, 0x04ce, 0x04d1, 0x052f, 0x0561, 0x0586,
626 0x2d00, 0x2d25, 0x2d27, 0x2d2d, 0x13f0, 0x13f5, 0x0432, 0x0434, 0x043e, 0x0441, 0x0442, 0x0442, 0x044a,
627 0x0463, 0xa64b, 0x10d0, 0x10fa, 0x10fd, 0x10ff, 0x1e01, 0x1e95, 0x1e61, 0x1ea1, 0x1eff, 0x1f00, 0x1f07,
628 0x1f10, 0x1f15, 0x1f20, 0x1f27, 0x1f30, 0x1f37, 0x1f40, 0x1f45, 0x1f51, 0x1f53, 0x1f55, 0x1f57, 0x1f60,
629 0x1f67, 0x1fb0, 0x1fb1, 0x1f70, 0x1f71, 0x03b9, 0x1f72, 0x1f75, 0x1fd0, 0x1fd1, 0x1f76, 0x1f77, 0x1fe0,
630 0x1fe1, 0x1f7a, 0x1f7b, 0x1fe5, 0x1f78, 0x1f79, 0x1f7c, 0x1f7d, 0x03c9, 0x006b, 0x00e5, 0x214e, 0x2170,
631 0x217f, 0x2184, 0x24d0, 0x24e9, 0x2c30, 0x2c5e, 0x2c61, 0x026b, 0x1d7d, 0x027d, 0x2c68, 0x2c6c, 0x0251,
632 0x0271, 0x0250, 0x0252, 0x2c73, 0x2c76, 0x023f, 0x0240, 0x2c81, 0x2ce3, 0x2cec, 0x2cee, 0x2cf3, 0xa641,
633 0xa66d, 0xa681, 0xa69b, 0xa723, 0xa72f, 0xa733, 0xa76f, 0xa77a, 0xa77c, 0x1d79, 0xa77f, 0xa787, 0xa78c,
634 0x0265, 0xa791, 0xa793, 0xa797, 0xa7a9, 0x0266, 0x025c, 0x0261, 0x026c, 0x026a, 0x029e, 0x0287, 0x029d,
635 0xab53, 0xa7b5, 0xa7bf, 0xa7c3, 0xa794, 0x0282, 0x1d8e, 0xa7c8, 0xa7ca, 0xa7f6, 0x13a0, 0x13ef, 0xff41,
636 0xff5a, 0x10428, 0x1044f, 0x104d8, 0x104fb, 0x10cc0, 0x10cf2, 0x118c0, 0x118df, 0x16e60, 0x16e7f, 0x1e922,
637 0x1e943
638 };
639 static const unsigned FOLD_MAP_2[] = {
640 S(0x00df), S(0x0130), S(0x0149), S(0x01f0), S(0x0587), S(0x1e96), S(0x1e97), S(0x1e98), S(0x1e99),
641 S(0x1e9a), S(0x1e9e), S(0x1f50), R(0x1f80,0x1f87), R(0x1f88,0x1f8f), R(0x1f90,0x1f97), R(0x1f98,0x1f9f),
642 R(0x1fa0,0x1fa7), R(0x1fa8,0x1faf), S(0x1fb2), S(0x1fb3), S(0x1fb4), S(0x1fb6), S(0x1fbc), S(0x1fc2),
643 S(0x1fc3), S(0x1fc4), S(0x1fc6), S(0x1fcc), S(0x1fd6), S(0x1fe4), S(0x1fe6), S(0x1ff2), S(0x1ff3),
644 S(0x1ff4), S(0x1ff6), S(0x1ffc), S(0xfb00), S(0xfb01), S(0xfb02), S(0xfb05), S(0xfb06), S(0xfb13),
645 S(0xfb14), S(0xfb15), S(0xfb16), S(0xfb17)
646 };
647 static const unsigned FOLD_MAP_2_DATA[] = {
648 0x0073,0x0073, 0x0069,0x0307, 0x02bc,0x006e, 0x006a,0x030c, 0x0565,0x0582, 0x0068,0x0331, 0x0074,0x0308,
649 0x0077,0x030a, 0x0079,0x030a, 0x0061,0x02be, 0x0073,0x0073, 0x03c5,0x0313, 0x1f00,0x03b9, 0x1f07,0x03b9,
650 0x1f00,0x03b9, 0x1f07,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f60,0x03b9,
651 0x1f67,0x03b9, 0x1f60,0x03b9, 0x1f67,0x03b9, 0x1f70,0x03b9, 0x03b1,0x03b9, 0x03ac,0x03b9, 0x03b1,0x0342,
652 0x03b1,0x03b9, 0x1f74,0x03b9, 0x03b7,0x03b9, 0x03ae,0x03b9, 0x03b7,0x0342, 0x03b7,0x03b9, 0x03b9,0x0342,
653 0x03c1,0x0313, 0x03c5,0x0342, 0x1f7c,0x03b9, 0x03c9,0x03b9, 0x03ce,0x03b9, 0x03c9,0x0342, 0x03c9,0x03b9,
654 0x0066,0x0066, 0x0066,0x0069, 0x0066,0x006c, 0x0073,0x0074, 0x0073,0x0074, 0x0574,0x0576, 0x0574,0x0565,
655 0x0574,0x056b, 0x057e,0x0576, 0x0574,0x056d
656 };
657 static const unsigned FOLD_MAP_3[] = {
658 S(0x0390), S(0x03b0), S(0x1f52), S(0x1f54), S(0x1f56), S(0x1fb7), S(0x1fc7), S(0x1fd2), S(0x1fd3),
659 S(0x1fd7), S(0x1fe2), S(0x1fe3), S(0x1fe7), S(0x1ff7), S(0xfb03), S(0xfb04)
660 };
661 static const unsigned FOLD_MAP_3_DATA[] = {
662 0x03b9,0x0308,0x0301, 0x03c5,0x0308,0x0301, 0x03c5,0x0313,0x0300, 0x03c5,0x0313,0x0301,
663 0x03c5,0x0313,0x0342, 0x03b1,0x0342,0x03b9, 0x03b7,0x0342,0x03b9, 0x03b9,0x0308,0x0300,
664 0x03b9,0x0308,0x0301, 0x03b9,0x0308,0x0342, 0x03c5,0x0308,0x0300, 0x03c5,0x0308,0x0301,
665 0x03c5,0x0308,0x0342, 0x03c9,0x0342,0x03b9, 0x0066,0x0066,0x0069, 0x0066,0x0066,0x006c
666 };
667#undef R
668#undef S
669 static const struct {
670 const unsigned* map;
671 const unsigned* data;
672 size_t map_size;
673 int n_codepoints;
674 } FOLD_MAP_LIST[] = {
675 { FOLD_MAP_1, FOLD_MAP_1_DATA, SIZEOF_ARRAY(FOLD_MAP_1), 1 },
676 { FOLD_MAP_2, FOLD_MAP_2_DATA, SIZEOF_ARRAY(FOLD_MAP_2), 2 },
677 { FOLD_MAP_3, FOLD_MAP_3_DATA, SIZEOF_ARRAY(FOLD_MAP_3), 3 }
678 };
679
680 int i;
681
682 /* Fast path for ASCII characters. */
683 if(codepoint <= 0x7f) {
684 info->codepoints[0] = codepoint;
685 if(ISUPPER_(codepoint))
686 info->codepoints[0] += 'a' - 'A';
687 info->n_codepoints = 1;
688 return;
689 }
690
691 /* Try to locate the codepoint in any of the maps. */
692 for(i = 0; i < (int) SIZEOF_ARRAY(FOLD_MAP_LIST); i++) {
693 int index;
694
695 index = md_unicode_bsearch__(codepoint, map: FOLD_MAP_LIST[i].map, map_size: FOLD_MAP_LIST[i].map_size);
696 if(index >= 0) {
697 /* Found the mapping. */
698 int n_codepoints = FOLD_MAP_LIST[i].n_codepoints;
699 const unsigned* map = FOLD_MAP_LIST[i].map;
700 const unsigned* codepoints = FOLD_MAP_LIST[i].data + (index * n_codepoints);
701
702 memcpy(dest: info->codepoints, src: codepoints, n: sizeof(unsigned) * n_codepoints);
703 info->n_codepoints = n_codepoints;
704
705 if(FOLD_MAP_LIST[i].map[index] != codepoint) {
706 /* The found mapping maps whole range of codepoints,
707 * i.e. we have to offset info->codepoints[0] accordingly. */
708 if((map[index] & 0x00ffffff)+1 == codepoints[0]) {
709 /* Alternating type of the range. */
710 info->codepoints[0] = codepoint + ((codepoint & 0x1) == (map[index] & 0x1) ? 1 : 0);
711 } else {
712 /* Range to range kind of mapping. */
713 info->codepoints[0] += (codepoint - (map[index] & 0x00ffffff));
714 }
715 }
716
717 return;
718 }
719 }
720
721 /* No mapping found. Map the codepoint to itself. */
722 info->codepoints[0] = codepoint;
723 info->n_codepoints = 1;
724 }
725#endif
726
727
728#if defined MD4C_USE_UTF16
/* UTF-16 surrogate helpers.
 * A high surrogate has its top six bits equal to those of 0xd800, a low
 * surrogate to those of 0xdc00. UTF16_DECODE_SURROGATE() combines the low
 * ten bits of both halves and adds 0x10000. */
#define IS_UTF16_SURROGATE_HI(word)     (((WORD)(word) & 0xfc00) == 0xd800)
#define IS_UTF16_SURROGATE_LO(word)     (((WORD)(word) & 0xfc00) == 0xdc00)
#define UTF16_DECODE_SURROGATE(hi, lo)  (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)))
732
733 static unsigned
734 md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size)
735 {
736 if(IS_UTF16_SURROGATE_HI(str[0])) {
737 if(1 < str_size && IS_UTF16_SURROGATE_LO(str[1])) {
738 if(p_size != NULL)
739 *p_size = 2;
740 return UTF16_DECODE_SURROGATE(str[0], str[1]);
741 }
742 }
743
744 if(p_size != NULL)
745 *p_size = 1;
746 return str[0];
747 }
748
749 static unsigned
750 md_decode_utf16le_before__(MD_CTX* ctx, OFF off)
751 {
752 if(off > 2 && IS_UTF16_SURROGATE_HI(CH(off-2)) && IS_UTF16_SURROGATE_LO(CH(off-1)))
753 return UTF16_DECODE_SURROGATE(CH(off-2), CH(off-1));
754
755 return CH(off);
756 }
757
/* Unicode classification of the character at / right before an offset of
 * the input text (UTF-16 build). The *BEFORE variants look at the character
 * ending at (off)-1. All but ISUNICODEWHITESPACE_() require "ctx" in scope.
 *
 * No whitespace uses surrogates, so no decoding needed here. */
#define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
#define ISUNICODEWHITESPACE(off)        md_is_unicode_whitespace__(CH(off))
#define ISUNICODEWHITESPACEBEFORE(off)  md_is_unicode_whitespace__(CH((off)-1))

#define ISUNICODEPUNCT(off)             md_is_unicode_punct__(md_decode_utf16le__(STR(off), ctx->size - (off), NULL))
#define ISUNICODEPUNCTBEFORE(off)       md_is_unicode_punct__(md_decode_utf16le_before__(ctx, off))
765
766 static inline int
767 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
768 {
769 return md_decode_utf16le__(str+off, str_size-off, p_char_size);
770 }
771#elif defined MD4C_USE_UTF8
/* UTF-8 byte classification.
 * IS_UTF8_LEADn() matches the bit pattern of the first byte of an n-byte
 * sequence (0xxxxxxx, 110xxxxx, 1110xxxx, 11110xxx), IS_UTF8_TAIL() the
 * pattern of a continuation byte (10xxxxxx). */
#define IS_UTF8_LEAD1(byte)     ((unsigned char)(byte) <= 0x7f)
#define IS_UTF8_LEAD2(byte)     (((unsigned char)(byte) & 0xe0) == 0xc0)
#define IS_UTF8_LEAD3(byte)     (((unsigned char)(byte) & 0xf0) == 0xe0)
#define IS_UTF8_LEAD4(byte)     (((unsigned char)(byte) & 0xf8) == 0xf0)
#define IS_UTF8_TAIL(byte)      (((unsigned char)(byte) & 0xc0) == 0x80)
777
778 static unsigned
779 md_decode_utf8__(const CHAR* str, SZ str_size, SZ* p_size)
780 {
781 if(!IS_UTF8_LEAD1(str[0])) {
782 if(IS_UTF8_LEAD2(str[0])) {
783 if(1 < str_size && IS_UTF8_TAIL(str[1])) {
784 if(p_size != NULL)
785 *p_size = 2;
786
787 return (((unsigned int)str[0] & 0x1f) << 6) |
788 (((unsigned int)str[1] & 0x3f) << 0);
789 }
790 } else if(IS_UTF8_LEAD3(str[0])) {
791 if(2 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2])) {
792 if(p_size != NULL)
793 *p_size = 3;
794
795 return (((unsigned int)str[0] & 0x0f) << 12) |
796 (((unsigned int)str[1] & 0x3f) << 6) |
797 (((unsigned int)str[2] & 0x3f) << 0);
798 }
799 } else if(IS_UTF8_LEAD4(str[0])) {
800 if(3 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2]) && IS_UTF8_TAIL(str[3])) {
801 if(p_size != NULL)
802 *p_size = 4;
803
804 return (((unsigned int)str[0] & 0x07) << 18) |
805 (((unsigned int)str[1] & 0x3f) << 12) |
806 (((unsigned int)str[2] & 0x3f) << 6) |
807 (((unsigned int)str[3] & 0x3f) << 0);
808 }
809 }
810 }
811
812 if(p_size != NULL)
813 *p_size = 1;
814 return (unsigned) str[0];
815 }
816
817 static unsigned
818 md_decode_utf8_before__(MD_CTX* ctx, OFF off)
819 {
820 if(!IS_UTF8_LEAD1(CH(off-1))) {
821 if(off > 1 && IS_UTF8_LEAD2(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
822 return (((unsigned int)CH(off-2) & 0x1f) << 6) |
823 (((unsigned int)CH(off-1) & 0x3f) << 0);
824
825 if(off > 2 && IS_UTF8_LEAD3(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
826 return (((unsigned int)CH(off-3) & 0x0f) << 12) |
827 (((unsigned int)CH(off-2) & 0x3f) << 6) |
828 (((unsigned int)CH(off-1) & 0x3f) << 0);
829
830 if(off > 3 && IS_UTF8_LEAD4(CH(off-4)) && IS_UTF8_TAIL(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
831 return (((unsigned int)CH(off-4) & 0x07) << 18) |
832 (((unsigned int)CH(off-3) & 0x3f) << 12) |
833 (((unsigned int)CH(off-2) & 0x3f) << 6) |
834 (((unsigned int)CH(off-1) & 0x3f) << 0);
835 }
836
837 return (unsigned) CH(off-1);
838 }
839
840 #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
841 #define ISUNICODEWHITESPACE(off) md_is_unicode_whitespace__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
842 #define ISUNICODEWHITESPACEBEFORE(off) md_is_unicode_whitespace__(md_decode_utf8_before__(ctx, off))
843
844 #define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
845 #define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf8_before__(ctx, off))
846
847 static inline unsigned
848 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
849 {
850 return md_decode_utf8__(str: str+off, str_size: str_size-off, p_size: p_char_size);
851 }
852#else
853 #define ISUNICODEWHITESPACE_(codepoint) ISWHITESPACE_(codepoint)
854 #define ISUNICODEWHITESPACE(off) ISWHITESPACE(off)
855 #define ISUNICODEWHITESPACEBEFORE(off) ISWHITESPACE((off)-1)
856
857 #define ISUNICODEPUNCT(off) ISPUNCT(off)
858 #define ISUNICODEPUNCTBEFORE(off) ISPUNCT((off)-1)
859
860 static inline void
861 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
862 {
863 info->codepoints[0] = codepoint;
864 if(ISUPPER_(codepoint))
865 info->codepoints[0] += 'a' - 'A';
866 info->n_codepoints = 1;
867 }
868
869 static inline unsigned
870 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_size)
871 {
872 *p_size = 1;
873 return (unsigned) str[off];
874 }
875#endif
876
877
878/*************************************
879 *** Helper string manipulations ***
880 *************************************/
881
882/* Fill buffer with copy of the string between 'beg' and 'end' but replace any
883 * line breaks with given replacement character.
884 *
885 * NOTE: Caller is responsible to make sure the buffer is large enough.
886 * (Given the output is always shorter then input, (end - beg) is good idea
887 * what the caller should allocate.)
888 */
889static void
890md_merge_lines(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
891 CHAR line_break_replacement_char, CHAR* buffer, SZ* p_size)
892{
893 CHAR* ptr = buffer;
894 int line_index = 0;
895 OFF off = beg;
896
897 while(1) {
898 const MD_LINE* line = &lines[line_index];
899 OFF line_end = line->end;
900 if(end < line_end)
901 line_end = end;
902
903 while(off < line_end) {
904 *ptr = CH(off);
905 ptr++;
906 off++;
907 }
908
909 if(off >= end) {
910 *p_size = ptr - buffer;
911 return;
912 }
913
914 *ptr = line_break_replacement_char;
915 ptr++;
916
917 line_index++;
918 off = lines[line_index].beg;
919 }
920}
921
922/* Wrapper of md_merge_lines() which allocates new buffer for the output string.
923 */
924static int
925md_merge_lines_alloc(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
926 CHAR line_break_replacement_char, CHAR** p_str, SZ* p_size)
927{
928 CHAR* buffer;
929
930 buffer = (CHAR*) malloc(size: sizeof(CHAR) * (end - beg));
931 if(buffer == NULL) {
932 MD_LOG("malloc() failed.");
933 return -1;
934 }
935
936 md_merge_lines(ctx, beg, end, lines, n_lines,
937 line_break_replacement_char, buffer, p_size);
938
939 *p_str = buffer;
940 return 0;
941}
942
943static OFF
944md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size)
945{
946 SZ char_size;
947 unsigned codepoint;
948
949 while(off < size) {
950 codepoint = md_decode_unicode(str: label, off, str_size: size, p_char_size: &char_size);
951 if(!ISUNICODEWHITESPACE_(codepoint) && !ISNEWLINE_(label[off]))
952 break;
953 off += char_size;
954 }
955
956 return off;
957}
958
959
960/******************************
961 *** Recognizing raw HTML ***
962 ******************************/
963
964/* md_is_html_tag() may be called when processing inlines (inline raw HTML)
965 * or when breaking document to blocks (checking for start of HTML block type 7).
966 *
967 * When breaking document to blocks, we do not yet know line boundaries, but
968 * in that case the whole tag has to live on a single line. We distinguish this
969 * by n_lines == 0.
970 */
971static int
972md_is_html_tag(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
973{
974 int attr_state;
975 OFF off = beg;
976 OFF line_end = (n_lines > 0) ? lines[0].end : ctx->size;
977 int i = 0;
978
979 MD_ASSERT(CH(beg) == _T('<'));
980
981 if(off + 1 >= line_end)
982 return FALSE;
983 off++;
984
985 /* For parsing attributes, we need a little state automaton below.
986 * State -1: no attributes are allowed.
987 * State 0: attribute could follow after some whitespace.
988 * State 1: after a whitespace (attribute name may follow).
989 * State 2: after attribute name ('=' MAY follow).
990 * State 3: after '=' (value specification MUST follow).
991 * State 41: in middle of unquoted attribute value.
992 * State 42: in middle of single-quoted attribute value.
993 * State 43: in middle of double-quoted attribute value.
994 */
995 attr_state = 0;
996
997 if(CH(off) == _T('/')) {
998 /* Closer tag "</ ... >". No attributes may be present. */
999 attr_state = -1;
1000 off++;
1001 }
1002
1003 /* Tag name */
1004 if(off >= line_end || !ISALPHA(off))
1005 return FALSE;
1006 off++;
1007 while(off < line_end && (ISALNUM(off) || CH(off) == _T('-')))
1008 off++;
1009
1010 /* (Optional) attributes (if not closer), (optional) '/' (if not closer)
1011 * and final '>'. */
1012 while(1) {
1013 while(off < line_end && !ISNEWLINE(off)) {
1014 if(attr_state > 40) {
1015 if(attr_state == 41 && (ISBLANK(off) || ISANYOF(off, _T("\"'=<>`")))) {
1016 attr_state = 0;
1017 off--; /* Put the char back for re-inspection in the new state. */
1018 } else if(attr_state == 42 && CH(off) == _T('\'')) {
1019 attr_state = 0;
1020 } else if(attr_state == 43 && CH(off) == _T('"')) {
1021 attr_state = 0;
1022 }
1023 off++;
1024 } else if(ISWHITESPACE(off)) {
1025 if(attr_state == 0)
1026 attr_state = 1;
1027 off++;
1028 } else if(attr_state <= 2 && CH(off) == _T('>')) {
1029 /* End. */
1030 goto done;
1031 } else if(attr_state <= 2 && CH(off) == _T('/') && off+1 < line_end && CH(off+1) == _T('>')) {
1032 /* End with digraph '/>' */
1033 off++;
1034 goto done;
1035 } else if((attr_state == 1 || attr_state == 2) && (ISALPHA(off) || CH(off) == _T('_') || CH(off) == _T(':'))) {
1036 off++;
1037 /* Attribute name */
1038 while(off < line_end && (ISALNUM(off) || ISANYOF(off, _T("_.:-"))))
1039 off++;
1040 attr_state = 2;
1041 } else if(attr_state == 2 && CH(off) == _T('=')) {
1042 /* Attribute assignment sign */
1043 off++;
1044 attr_state = 3;
1045 } else if(attr_state == 3) {
1046 /* Expecting start of attribute value. */
1047 if(CH(off) == _T('"'))
1048 attr_state = 43;
1049 else if(CH(off) == _T('\''))
1050 attr_state = 42;
1051 else if(!ISANYOF(off, _T("\"'=<>`")) && !ISNEWLINE(off))
1052 attr_state = 41;
1053 else
1054 return FALSE;
1055 off++;
1056 } else {
1057 /* Anything unexpected. */
1058 return FALSE;
1059 }
1060 }
1061
1062 /* We have to be on a single line. See definition of start condition
1063 * of HTML block, type 7. */
1064 if(n_lines == 0)
1065 return FALSE;
1066
1067 i++;
1068 if(i >= n_lines)
1069 return FALSE;
1070
1071 off = lines[i].beg;
1072 line_end = lines[i].end;
1073
1074 if(attr_state == 0 || attr_state == 41)
1075 attr_state = 1;
1076
1077 if(off >= max_end)
1078 return FALSE;
1079 }
1080
1081done:
1082 if(off >= max_end)
1083 return FALSE;
1084
1085 *p_end = off+1;
1086 return TRUE;
1087}
1088
1089static int
1090md_scan_for_html_closer(MD_CTX* ctx, const MD_CHAR* str, MD_SIZE len,
1091 const MD_LINE* lines, int n_lines,
1092 OFF beg, OFF max_end, OFF* p_end,
1093 OFF* p_scan_horizon)
1094{
1095 OFF off = beg;
1096 int i = 0;
1097
1098 if(off < *p_scan_horizon && *p_scan_horizon >= max_end - len) {
1099 /* We have already scanned the range up to the max_end so we know
1100 * there is nothing to see. */
1101 return FALSE;
1102 }
1103
1104 while(TRUE) {
1105 while(off + len <= lines[i].end && off + len <= max_end) {
1106 if(md_ascii_eq(STR(off), s2: str, n: len)) {
1107 /* Success. */
1108 *p_end = off + len;
1109 return TRUE;
1110 }
1111 off++;
1112 }
1113
1114 i++;
1115 if(off >= max_end || i >= n_lines) {
1116 /* Failure. */
1117 *p_scan_horizon = off;
1118 return FALSE;
1119 }
1120
1121 off = lines[i].beg;
1122 }
1123}
1124
1125static int
1126md_is_html_comment(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1127{
1128 OFF off = beg;
1129
1130 MD_ASSERT(CH(beg) == _T('<'));
1131
1132 if(off + 4 >= lines[0].end)
1133 return FALSE;
1134 if(CH(off+1) != _T('!') || CH(off+2) != _T('-') || CH(off+3) != _T('-'))
1135 return FALSE;
1136 off += 4;
1137
1138 /* ">" and "->" must not follow the opening. */
1139 if(off < lines[0].end && CH(off) == _T('>'))
1140 return FALSE;
1141 if(off+1 < lines[0].end && CH(off) == _T('-') && CH(off+1) == _T('>'))
1142 return FALSE;
1143
1144 /* HTML comment must not contain "--", so we scan just for "--" instead
1145 * of "-->" and verify manually that '>' follows. */
1146 if(md_scan_for_html_closer(ctx, _T("--"), len: 2,
1147 lines, n_lines, beg: off, max_end, p_end, p_scan_horizon: &ctx->html_comment_horizon))
1148 {
1149 if(*p_end < max_end && CH(*p_end) == _T('>')) {
1150 *p_end = *p_end + 1;
1151 return TRUE;
1152 }
1153 }
1154
1155 return FALSE;
1156}
1157
1158static int
1159md_is_html_processing_instruction(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1160{
1161 OFF off = beg;
1162
1163 if(off + 2 >= lines[0].end)
1164 return FALSE;
1165 if(CH(off+1) != _T('?'))
1166 return FALSE;
1167 off += 2;
1168
1169 return md_scan_for_html_closer(ctx, _T("?>"), len: 2,
1170 lines, n_lines, beg: off, max_end, p_end, p_scan_horizon: &ctx->html_proc_instr_horizon);
1171}
1172
1173static int
1174md_is_html_declaration(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1175{
1176 OFF off = beg;
1177
1178 if(off + 2 >= lines[0].end)
1179 return FALSE;
1180 if(CH(off+1) != _T('!'))
1181 return FALSE;
1182 off += 2;
1183
1184 /* Declaration name. */
1185 if(off >= lines[0].end || !ISALPHA(off))
1186 return FALSE;
1187 off++;
1188 while(off < lines[0].end && ISALPHA(off))
1189 off++;
1190 if(off < lines[0].end && !ISWHITESPACE(off))
1191 return FALSE;
1192
1193 return md_scan_for_html_closer(ctx, _T(">"), len: 1,
1194 lines, n_lines, beg: off, max_end, p_end, p_scan_horizon: &ctx->html_decl_horizon);
1195}
1196
1197static int
1198md_is_html_cdata(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1199{
1200 static const CHAR open_str[] = _T("<![CDATA[");
1201 static const SZ open_size = SIZEOF_ARRAY(open_str) - 1;
1202
1203 OFF off = beg;
1204
1205 if(off + open_size >= lines[0].end)
1206 return FALSE;
1207 if(memcmp(STR(off), s2: open_str, n: open_size) != 0)
1208 return FALSE;
1209 off += open_size;
1210
1211 if(lines[n_lines-1].end < max_end)
1212 max_end = lines[n_lines-1].end - 2;
1213
1214 return md_scan_for_html_closer(ctx, _T("]]>"), len: 3,
1215 lines, n_lines, beg: off, max_end, p_end, p_scan_horizon: &ctx->html_cdata_horizon);
1216}
1217
1218static int
1219md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1220{
1221 MD_ASSERT(CH(beg) == _T('<'));
1222 return (md_is_html_tag(ctx, lines, n_lines, beg, max_end, p_end) ||
1223 md_is_html_comment(ctx, lines, n_lines, beg, max_end, p_end) ||
1224 md_is_html_processing_instruction(ctx, lines, n_lines, beg, max_end, p_end) ||
1225 md_is_html_declaration(ctx, lines, n_lines, beg, max_end, p_end) ||
1226 md_is_html_cdata(ctx, lines, n_lines, beg, max_end, p_end));
1227}
1228
1229
1230/****************************
1231 *** Recognizing Entity ***
1232 ****************************/
1233
1234static int
1235md_is_hex_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1236{
1237 OFF off = beg;
1238
1239 while(off < max_end && ISXDIGIT_(text[off]) && off - beg <= 8)
1240 off++;
1241
1242 if(1 <= off - beg && off - beg <= 6) {
1243 *p_end = off;
1244 return TRUE;
1245 } else {
1246 return FALSE;
1247 }
1248}
1249
1250static int
1251md_is_dec_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1252{
1253 OFF off = beg;
1254
1255 while(off < max_end && ISDIGIT_(text[off]) && off - beg <= 8)
1256 off++;
1257
1258 if(1 <= off - beg && off - beg <= 7) {
1259 *p_end = off;
1260 return TRUE;
1261 } else {
1262 return FALSE;
1263 }
1264}
1265
1266static int
1267md_is_named_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1268{
1269 OFF off = beg;
1270
1271 if(off < max_end && ISALPHA_(text[off]))
1272 off++;
1273 else
1274 return FALSE;
1275
1276 while(off < max_end && ISALNUM_(text[off]) && off - beg <= 48)
1277 off++;
1278
1279 if(2 <= off - beg && off - beg <= 48) {
1280 *p_end = off;
1281 return TRUE;
1282 } else {
1283 return FALSE;
1284 }
1285}
1286
1287static int
1288md_is_entity_str(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1289{
1290 int is_contents;
1291 OFF off = beg;
1292
1293 MD_ASSERT(text[off] == _T('&'));
1294 off++;
1295
1296 if(off+2 < max_end && text[off] == _T('#') && (text[off+1] == _T('x') || text[off+1] == _T('X')))
1297 is_contents = md_is_hex_entity_contents(ctx, text, beg: off+2, max_end, p_end: &off);
1298 else if(off+1 < max_end && text[off] == _T('#'))
1299 is_contents = md_is_dec_entity_contents(ctx, text, beg: off+1, max_end, p_end: &off);
1300 else
1301 is_contents = md_is_named_entity_contents(ctx, text, beg: off, max_end, p_end: &off);
1302
1303 if(is_contents && off < max_end && text[off] == _T(';')) {
1304 *p_end = off+1;
1305 return TRUE;
1306 } else {
1307 return FALSE;
1308 }
1309}
1310
1311static inline int
1312md_is_entity(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
1313{
1314 return md_is_entity_str(ctx, text: ctx->text, beg, max_end, p_end);
1315}
1316
1317
1318/******************************
1319 *** Attribute Management ***
1320 ******************************/
1321
1322typedef struct MD_ATTRIBUTE_BUILD_tag MD_ATTRIBUTE_BUILD;
1323struct MD_ATTRIBUTE_BUILD_tag {
1324 CHAR* text;
1325 MD_TEXTTYPE* substr_types;
1326 OFF* substr_offsets;
1327 int substr_count;
1328 int substr_alloc;
1329 MD_TEXTTYPE trivial_types[1];
1330 OFF trivial_offsets[2];
1331};
1332
1333
1334#define MD_BUILD_ATTR_NO_ESCAPES 0x0001
1335
1336static int
1337md_build_attr_append_substr(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build,
1338 MD_TEXTTYPE type, OFF off)
1339{
1340 if(build->substr_count >= build->substr_alloc) {
1341 MD_TEXTTYPE* new_substr_types;
1342 OFF* new_substr_offsets;
1343
1344 build->substr_alloc = (build->substr_alloc > 0
1345 ? build->substr_alloc + build->substr_alloc / 2
1346 : 8);
1347 new_substr_types = (MD_TEXTTYPE*) realloc(ptr: build->substr_types,
1348 size: build->substr_alloc * sizeof(MD_TEXTTYPE));
1349 if(new_substr_types == NULL) {
1350 MD_LOG("realloc() failed.");
1351 return -1;
1352 }
1353 /* Note +1 to reserve space for final offset (== raw_size). */
1354 new_substr_offsets = (OFF*) realloc(ptr: build->substr_offsets,
1355 size: (build->substr_alloc+1) * sizeof(OFF));
1356 if(new_substr_offsets == NULL) {
1357 MD_LOG("realloc() failed.");
1358 free(ptr: new_substr_types);
1359 return -1;
1360 }
1361
1362 build->substr_types = new_substr_types;
1363 build->substr_offsets = new_substr_offsets;
1364 }
1365
1366 build->substr_types[build->substr_count] = type;
1367 build->substr_offsets[build->substr_count] = off;
1368 build->substr_count++;
1369 return 0;
1370}
1371
1372static void
1373md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build)
1374{
1375 if(build->substr_alloc > 0) {
1376 free(ptr: build->text);
1377 free(ptr: build->substr_types);
1378 free(ptr: build->substr_offsets);
1379 }
1380}
1381
1382static int
1383md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size,
1384 unsigned flags, MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build)
1385{
1386 OFF raw_off, off;
1387 int is_trivial;
1388 int ret = 0;
1389
1390 memset(s: build, c: 0, n: sizeof(MD_ATTRIBUTE_BUILD));
1391
1392 /* If there is no backslash and no ampersand, build trivial attribute
1393 * without any malloc(). */
1394 is_trivial = TRUE;
1395 for(raw_off = 0; raw_off < raw_size; raw_off++) {
1396 if(ISANYOF3_(raw_text[raw_off], _T('\\'), _T('&'), _T('\0'))) {
1397 is_trivial = FALSE;
1398 break;
1399 }
1400 }
1401
1402 if(is_trivial) {
1403 build->text = (CHAR*) (raw_size ? raw_text : NULL);
1404 build->substr_types = build->trivial_types;
1405 build->substr_offsets = build->trivial_offsets;
1406 build->substr_count = 1;
1407 build->substr_alloc = 0;
1408 build->trivial_types[0] = MD_TEXT_NORMAL;
1409 build->trivial_offsets[0] = 0;
1410 build->trivial_offsets[1] = raw_size;
1411 off = raw_size;
1412 } else {
1413 build->text = (CHAR*) malloc(size: raw_size * sizeof(CHAR));
1414 if(build->text == NULL) {
1415 MD_LOG("malloc() failed.");
1416 goto abort;
1417 }
1418
1419 raw_off = 0;
1420 off = 0;
1421
1422 while(raw_off < raw_size) {
1423 if(raw_text[raw_off] == _T('\0')) {
1424 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NULLCHAR, off));
1425 memcpy(dest: build->text + off, src: raw_text + raw_off, n: 1);
1426 off++;
1427 raw_off++;
1428 continue;
1429 }
1430
1431 if(raw_text[raw_off] == _T('&')) {
1432 OFF ent_end;
1433
1434 if(md_is_entity_str(ctx, text: raw_text, beg: raw_off, max_end: raw_size, p_end: &ent_end)) {
1435 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_ENTITY, off));
1436 memcpy(dest: build->text + off, src: raw_text + raw_off, n: ent_end - raw_off);
1437 off += ent_end - raw_off;
1438 raw_off = ent_end;
1439 continue;
1440 }
1441 }
1442
1443 if(build->substr_count == 0 || build->substr_types[build->substr_count-1] != MD_TEXT_NORMAL)
1444 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NORMAL, off));
1445
1446 if(!(flags & MD_BUILD_ATTR_NO_ESCAPES) &&
1447 raw_text[raw_off] == _T('\\') && raw_off+1 < raw_size &&
1448 (ISPUNCT_(raw_text[raw_off+1]) || ISNEWLINE_(raw_text[raw_off+1])))
1449 raw_off++;
1450
1451 build->text[off++] = raw_text[raw_off++];
1452 }
1453 build->substr_offsets[build->substr_count] = off;
1454 }
1455
1456 attr->text = build->text;
1457 attr->size = off;
1458 attr->substr_offsets = build->substr_offsets;
1459 attr->substr_types = build->substr_types;
1460 return 0;
1461
1462abort:
1463 md_free_attribute(ctx, build);
1464 return -1;
1465}
1466
1467
1468/*********************************************
1469 *** Dictionary of Reference Definitions ***
1470 *********************************************/
1471
/* Parameters of the 32-bit FNV-1a hash. */
#define MD_FNV1A_BASE       2166136261U
#define MD_FNV1A_PRIME      16777619U

/* Feed 'n' bytes of 'data' into FNV-1a hash whose current state is 'base'
 * and return the new state. Use MD_FNV1A_BASE to start a new hash; pass
 * the previous return value to continue hashing incrementally. */
static inline unsigned
md_fnv1a(unsigned base, const void* data, size_t n)
{
    const unsigned char* byte = (const unsigned char*) data;
    unsigned hash = base;

    while(n-- > 0) {
        hash ^= *byte++;
        hash *= MD_FNV1A_PRIME;
    }

    return hash;
}
1489
1490
/* One link reference definition. */
struct MD_REF_DEF_tag {
    CHAR* label;            /* Label text ('label_size' characters). */
    CHAR* title;            /* Title text ('title_size' characters). */
    unsigned hash;          /* Hash of the label; computed by md_build_ref_def_hashtable(). */
    SZ label_size;
    SZ title_size;
    OFF dest_beg;           /* NOTE(review): presumably the range of the link destination. */
    OFF dest_end;
    /* NOTE(review): presumably set when the respective string is
     * heap-allocated and owned by this record; confirm against the code
     * which fills and releases the record. */
    unsigned char label_needs_free : 1;
    unsigned char title_needs_free : 1;
};
1502
1503/* Label equivalence is quite complicated with regards to whitespace and case
1504 * folding. This complicates computing a hash of it as well as direct comparison
1505 * of two labels. */
1506
1507static unsigned
1508md_link_label_hash(const CHAR* label, SZ size)
1509{
1510 unsigned hash = MD_FNV1A_BASE;
1511 OFF off;
1512 unsigned codepoint;
1513 int is_whitespace = FALSE;
1514
1515 off = md_skip_unicode_whitespace(label, off: 0, size);
1516 while(off < size) {
1517 SZ char_size;
1518
1519 codepoint = md_decode_unicode(str: label, off, str_size: size, p_char_size: &char_size);
1520 is_whitespace = ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE_(label[off]);
1521
1522 if(is_whitespace) {
1523 codepoint = ' ';
1524 hash = md_fnv1a(base: hash, data: &codepoint, n: sizeof(unsigned));
1525 off = md_skip_unicode_whitespace(label, off, size);
1526 } else {
1527 MD_UNICODE_FOLD_INFO fold_info;
1528
1529 md_get_unicode_fold_info(codepoint, info: &fold_info);
1530 hash = md_fnv1a(base: hash, data: fold_info.codepoints, n: fold_info.n_codepoints * sizeof(unsigned));
1531 off += char_size;
1532 }
1533 }
1534
1535 return hash;
1536}
1537
1538static OFF
1539md_link_label_cmp_load_fold_info(const CHAR* label, OFF off, SZ size,
1540 MD_UNICODE_FOLD_INFO* fold_info)
1541{
1542 unsigned codepoint;
1543 SZ char_size;
1544
1545 if(off >= size) {
1546 /* Treat end of a link label as a whitespace. */
1547 goto whitespace;
1548 }
1549
1550 if(ISNEWLINE_(label[off])) {
1551 /* Treat new lines as a whitespace. */
1552 off++;
1553 goto whitespace;
1554 }
1555
1556 codepoint = md_decode_unicode(str: label, off, str_size: size, p_char_size: &char_size);
1557 off += char_size;
1558 if(ISUNICODEWHITESPACE_(codepoint)) {
1559 /* Treat all whitespace as equivalent */
1560 goto whitespace;
1561 }
1562
1563 /* Get real folding info. */
1564 md_get_unicode_fold_info(codepoint, info: fold_info);
1565 return off;
1566
1567whitespace:
1568 fold_info->codepoints[0] = _T(' ');
1569 fold_info->n_codepoints = 1;
1570 return md_skip_unicode_whitespace(label, off, size);
1571}
1572
1573static int
1574md_link_label_cmp(const CHAR* a_label, SZ a_size, const CHAR* b_label, SZ b_size)
1575{
1576 OFF a_off;
1577 OFF b_off;
1578 int a_reached_end = FALSE;
1579 int b_reached_end = FALSE;
1580 MD_UNICODE_FOLD_INFO a_fi = { { 0 }, 0 };
1581 MD_UNICODE_FOLD_INFO b_fi = { { 0 }, 0 };
1582 OFF a_fi_off = 0;
1583 OFF b_fi_off = 0;
1584 int cmp;
1585
1586 a_off = md_skip_unicode_whitespace(label: a_label, off: 0, size: a_size);
1587 b_off = md_skip_unicode_whitespace(label: b_label, off: 0, size: b_size);
1588 while(!a_reached_end || !b_reached_end) {
1589 /* If needed, load fold info for next char. */
1590 if(a_fi_off >= a_fi.n_codepoints) {
1591 a_fi_off = 0;
1592 a_off = md_link_label_cmp_load_fold_info(label: a_label, off: a_off, size: a_size, fold_info: &a_fi);
1593 a_reached_end = (a_off >= a_size);
1594 }
1595 if(b_fi_off >= b_fi.n_codepoints) {
1596 b_fi_off = 0;
1597 b_off = md_link_label_cmp_load_fold_info(label: b_label, off: b_off, size: b_size, fold_info: &b_fi);
1598 b_reached_end = (b_off >= b_size);
1599 }
1600
1601 cmp = b_fi.codepoints[b_fi_off] - a_fi.codepoints[a_fi_off];
1602 if(cmp != 0)
1603 return cmp;
1604
1605 a_fi_off++;
1606 b_fi_off++;
1607 }
1608
1609 return 0;
1610}
1611
1612typedef struct MD_REF_DEF_LIST_tag MD_REF_DEF_LIST;
1613struct MD_REF_DEF_LIST_tag {
1614 int n_ref_defs;
1615 int alloc_ref_defs;
1616 MD_REF_DEF* ref_defs[]; /* Valid items always point into ctx->ref_defs[] */
1617};
1618
1619static int
1620md_ref_def_cmp(const void* a, const void* b)
1621{
1622 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1623 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1624
1625 if(a_ref->hash < b_ref->hash)
1626 return -1;
1627 else if(a_ref->hash > b_ref->hash)
1628 return +1;
1629 else
1630 return md_link_label_cmp(a_label: a_ref->label, a_size: a_ref->label_size, b_label: b_ref->label, b_size: b_ref->label_size);
1631}
1632
1633static int
1634md_ref_def_cmp_for_sort(const void* a, const void* b)
1635{
1636 int cmp;
1637
1638 cmp = md_ref_def_cmp(a, b);
1639
1640 /* Ensure stability of the sorting. */
1641 if(cmp == 0) {
1642 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1643 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1644
1645 if(a_ref < b_ref)
1646 cmp = -1;
1647 else if(a_ref > b_ref)
1648 cmp = +1;
1649 else
1650 cmp = 0;
1651 }
1652
1653 return cmp;
1654}
1655
1656static int
1657md_build_ref_def_hashtable(MD_CTX* ctx)
1658{
1659 int i, j;
1660
1661 if(ctx->n_ref_defs == 0)
1662 return 0;
1663
1664 ctx->ref_def_hashtable_size = (ctx->n_ref_defs * 5) / 4;
1665 ctx->ref_def_hashtable = malloc(size: ctx->ref_def_hashtable_size * sizeof(void*));
1666 if(ctx->ref_def_hashtable == NULL) {
1667 MD_LOG("malloc() failed.");
1668 goto abort;
1669 }
1670 memset(s: ctx->ref_def_hashtable, c: 0, n: ctx->ref_def_hashtable_size * sizeof(void*));
1671
1672 /* Each member of ctx->ref_def_hashtable[] can be:
1673 * -- NULL,
1674 * -- pointer to the MD_REF_DEF in ctx->ref_defs[], or
1675 * -- pointer to a MD_REF_DEF_LIST, which holds multiple pointers to
1676 * such MD_REF_DEFs.
1677 */
1678 for(i = 0; i < ctx->n_ref_defs; i++) {
1679 MD_REF_DEF* def = &ctx->ref_defs[i];
1680 void* bucket;
1681 MD_REF_DEF_LIST* list;
1682
1683 def->hash = md_link_label_hash(label: def->label, size: def->label_size);
1684 bucket = ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size];
1685
1686 if(bucket == NULL) {
1687 /* The bucket is empty. Make it just point to the def. */
1688 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = def;
1689 continue;
1690 }
1691
1692 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1693 /* The bucket already contains one ref. def. Lets see whether it
1694 * is the same label (ref. def. duplicate) or different one
1695 * (hash conflict). */
1696 MD_REF_DEF* old_def = (MD_REF_DEF*) bucket;
1697
1698 if(md_link_label_cmp(a_label: def->label, a_size: def->label_size, b_label: old_def->label, b_size: old_def->label_size) == 0) {
1699 /* Duplicate label: Ignore this ref. def. */
1700 continue;
1701 }
1702
1703 /* Make the bucket complex, i.e. able to hold more ref. defs. */
1704 list = (MD_REF_DEF_LIST*) malloc(size: sizeof(MD_REF_DEF_LIST) + 2 * sizeof(MD_REF_DEF*));
1705 if(list == NULL) {
1706 MD_LOG("malloc() failed.");
1707 goto abort;
1708 }
1709 list->ref_defs[0] = old_def;
1710 list->ref_defs[1] = def;
1711 list->n_ref_defs = 2;
1712 list->alloc_ref_defs = 2;
1713 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
1714 continue;
1715 }
1716
1717 /* Append the def to the complex bucket list.
1718 *
1719 * Note in this case we ignore potential duplicates to avoid expensive
1720 * iterating over the complex bucket. Below, we revisit all the complex
1721 * buckets and handle it more cheaply after the complex bucket contents
1722 * is sorted. */
1723 list = (MD_REF_DEF_LIST*) bucket;
1724 if(list->n_ref_defs >= list->alloc_ref_defs) {
1725 int alloc_ref_defs = list->alloc_ref_defs + list->alloc_ref_defs / 2;
1726 MD_REF_DEF_LIST* list_tmp = (MD_REF_DEF_LIST*) realloc(ptr: list,
1727 size: sizeof(MD_REF_DEF_LIST) + alloc_ref_defs * sizeof(MD_REF_DEF*));
1728 if(list_tmp == NULL) {
1729 MD_LOG("realloc() failed.");
1730 goto abort;
1731 }
1732 list = list_tmp;
1733 list->alloc_ref_defs = alloc_ref_defs;
1734 ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
1735 }
1736
1737 list->ref_defs[list->n_ref_defs] = def;
1738 list->n_ref_defs++;
1739 }
1740
1741 /* Sort the complex buckets so we can use bsearch() with them. */
1742 for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1743 void* bucket = ctx->ref_def_hashtable[i];
1744 MD_REF_DEF_LIST* list;
1745
1746 if(bucket == NULL)
1747 continue;
1748 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1749 continue;
1750
1751 list = (MD_REF_DEF_LIST*) bucket;
1752 qsort(base: list->ref_defs, nmemb: list->n_ref_defs, size: sizeof(MD_REF_DEF*), compar: md_ref_def_cmp_for_sort);
1753
1754 /* Disable all duplicates in the complex bucket by forcing all such
1755 * records to point to the 1st such ref. def. I.e. no matter which
1756 * record is found during the lookup, it will always point to the right
1757 * ref. def. in ctx->ref_defs[]. */
1758 for(j = 1; j < list->n_ref_defs; j++) {
1759 if(md_ref_def_cmp(a: &list->ref_defs[j-1], b: &list->ref_defs[j]) == 0)
1760 list->ref_defs[j] = list->ref_defs[j-1];
1761 }
1762 }
1763
1764 return 0;
1765
1766abort:
1767 return -1;
1768}
1769
1770static void
1771md_free_ref_def_hashtable(MD_CTX* ctx)
1772{
1773 if(ctx->ref_def_hashtable != NULL) {
1774 int i;
1775
1776 for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1777 void* bucket = ctx->ref_def_hashtable[i];
1778 if(bucket == NULL)
1779 continue;
1780 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1781 continue;
1782 free(ptr: bucket);
1783 }
1784
1785 free(ptr: ctx->ref_def_hashtable);
1786 }
1787}
1788
1789static const MD_REF_DEF*
1790md_lookup_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size)
1791{
1792 unsigned hash;
1793 void* bucket;
1794
1795 if(ctx->ref_def_hashtable_size == 0)
1796 return NULL;
1797
1798 hash = md_link_label_hash(label, size: label_size);
1799 bucket = ctx->ref_def_hashtable[hash % ctx->ref_def_hashtable_size];
1800
1801 if(bucket == NULL) {
1802 return NULL;
1803 } else if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1804 const MD_REF_DEF* def = (MD_REF_DEF*) bucket;
1805
1806 if(md_link_label_cmp(a_label: def->label, a_size: def->label_size, b_label: label, b_size: label_size) == 0)
1807 return def;
1808 else
1809 return NULL;
1810 } else {
1811 MD_REF_DEF_LIST* list = (MD_REF_DEF_LIST*) bucket;
1812 MD_REF_DEF key_buf;
1813 const MD_REF_DEF* key = &key_buf;
1814 const MD_REF_DEF** ret;
1815
1816 key_buf.label = (CHAR*) label;
1817 key_buf.label_size = label_size;
1818 key_buf.hash = md_link_label_hash(label: key_buf.label, size: key_buf.label_size);
1819
1820 ret = (const MD_REF_DEF**) bsearch(key: &key, base: list->ref_defs,
1821 nmemb: list->n_ref_defs, size: sizeof(MD_REF_DEF*), compar: md_ref_def_cmp);
1822 if(ret != NULL)
1823 return *ret;
1824 else
1825 return NULL;
1826 }
1827}
1828
1829
1830/***************************
1831 *** Recognizing Links ***
1832 ***************************/
1833
1834/* Note this code is partially shared between processing inlines and blocks
1835 * as reference definitions and links share some helper parser functions.
1836 */
1837
1838typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR;
1839struct MD_LINK_ATTR_tag {
1840 OFF dest_beg;
1841 OFF dest_end;
1842
1843 CHAR* title;
1844 SZ title_size;
1845 int title_needs_free;
1846};
1847
1848
1849static int
1850md_is_link_label(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
1851 OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
1852 OFF* p_contents_beg, OFF* p_contents_end)
1853{
1854 OFF off = beg;
1855 OFF contents_beg = 0;
1856 OFF contents_end = 0;
1857 int line_index = 0;
1858 int len = 0;
1859
1860 if(CH(off) != _T('['))
1861 return FALSE;
1862 off++;
1863
1864 while(1) {
1865 OFF line_end = lines[line_index].end;
1866
1867 while(off < line_end) {
1868 if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
1869 if(contents_end == 0) {
1870 contents_beg = off;
1871 *p_beg_line_index = line_index;
1872 }
1873 contents_end = off + 2;
1874 off += 2;
1875 } else if(CH(off) == _T('[')) {
1876 return FALSE;
1877 } else if(CH(off) == _T(']')) {
1878 if(contents_beg < contents_end) {
1879 /* Success. */
1880 *p_contents_beg = contents_beg;
1881 *p_contents_end = contents_end;
1882 *p_end = off+1;
1883 *p_end_line_index = line_index;
1884 return TRUE;
1885 } else {
1886 /* Link label must have some non-whitespace contents. */
1887 return FALSE;
1888 }
1889 } else {
1890 unsigned codepoint;
1891 SZ char_size;
1892
1893 codepoint = md_decode_unicode(str: ctx->text, off, str_size: ctx->size, p_char_size: &char_size);
1894 if(!ISUNICODEWHITESPACE_(codepoint)) {
1895 if(contents_end == 0) {
1896 contents_beg = off;
1897 *p_beg_line_index = line_index;
1898 }
1899 contents_end = off + char_size;
1900 }
1901
1902 off += char_size;
1903 }
1904
1905 len++;
1906 if(len > 999)
1907 return FALSE;
1908 }
1909
1910 line_index++;
1911 len++;
1912 if(line_index < n_lines)
1913 off = lines[line_index].beg;
1914 else
1915 break;
1916 }
1917
1918 return FALSE;
1919}
1920
1921static int
1922md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1923 OFF* p_contents_beg, OFF* p_contents_end)
1924{
1925 OFF off = beg;
1926
1927 if(off >= max_end || CH(off) != _T('<'))
1928 return FALSE;
1929 off++;
1930
1931 while(off < max_end) {
1932 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) {
1933 off += 2;
1934 continue;
1935 }
1936
1937 if(ISNEWLINE(off) || CH(off) == _T('<'))
1938 return FALSE;
1939
1940 if(CH(off) == _T('>')) {
1941 /* Success. */
1942 *p_contents_beg = beg+1;
1943 *p_contents_end = off;
1944 *p_end = off+1;
1945 return TRUE;
1946 }
1947
1948 off++;
1949 }
1950
1951 return FALSE;
1952}
1953
1954static int
1955md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1956 OFF* p_contents_beg, OFF* p_contents_end)
1957{
1958 OFF off = beg;
1959 int parenthesis_level = 0;
1960
1961 while(off < max_end) {
1962 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) {
1963 off += 2;
1964 continue;
1965 }
1966
1967 if(ISWHITESPACE(off) || ISCNTRL(off))
1968 break;
1969
1970 /* Link destination may include balanced pairs of unescaped '(' ')'.
1971 * Note we limit the maximal nesting level by 32 to protect us from
1972 * https://github.com/jgm/cmark/issues/214 */
1973 if(CH(off) == _T('(')) {
1974 parenthesis_level++;
1975 if(parenthesis_level > 32)
1976 return FALSE;
1977 } else if(CH(off) == _T(')')) {
1978 if(parenthesis_level == 0)
1979 break;
1980 parenthesis_level--;
1981 }
1982
1983 off++;
1984 }
1985
1986 if(parenthesis_level != 0 || off == beg)
1987 return FALSE;
1988
1989 /* Success. */
1990 *p_contents_beg = beg;
1991 *p_contents_end = off;
1992 *p_end = off;
1993 return TRUE;
1994}
1995
1996static inline int
1997md_is_link_destination(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1998 OFF* p_contents_beg, OFF* p_contents_end)
1999{
2000 if(CH(beg) == _T('<'))
2001 return md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2002 else
2003 return md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2004}
2005
2006static int
2007md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
2008 OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
2009 OFF* p_contents_beg, OFF* p_contents_end)
2010{
2011 OFF off = beg;
2012 CHAR closer_char;
2013 int line_index = 0;
2014
2015 /* White space with up to one line break. */
2016 while(off < lines[line_index].end && ISWHITESPACE(off))
2017 off++;
2018 if(off >= lines[line_index].end) {
2019 line_index++;
2020 if(line_index >= n_lines)
2021 return FALSE;
2022 off = lines[line_index].beg;
2023 }
2024 if(off == beg)
2025 return FALSE;
2026
2027 *p_beg_line_index = line_index;
2028
2029 /* First char determines how to detect end of it. */
2030 switch(CH(off)) {
2031 case _T('"'): closer_char = _T('"'); break;
2032 case _T('\''): closer_char = _T('\''); break;
2033 case _T('('): closer_char = _T(')'); break;
2034 default: return FALSE;
2035 }
2036 off++;
2037
2038 *p_contents_beg = off;
2039
2040 while(line_index < n_lines) {
2041 OFF line_end = lines[line_index].end;
2042
2043 while(off < line_end) {
2044 if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
2045 off++;
2046 } else if(CH(off) == closer_char) {
2047 /* Success. */
2048 *p_contents_end = off;
2049 *p_end = off+1;
2050 *p_end_line_index = line_index;
2051 return TRUE;
2052 } else if(closer_char == _T(')') && CH(off) == _T('(')) {
2053 /* ()-style title cannot contain (unescaped '(')) */
2054 return FALSE;
2055 }
2056
2057 off++;
2058 }
2059
2060 line_index++;
2061 }
2062
2063 return FALSE;
2064}
2065
2066/* Returns 0 if it is not a reference definition.
2067 *
2068 * Returns N > 0 if it is a reference definition. N then corresponds to the
2069 * number of lines forming it). In this case the definition is stored for
2070 * resolving any links referring to it.
2071 *
2072 * Returns -1 in case of an error (out of memory).
2073 */
2074static int
2075md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
2076{
2077 OFF label_contents_beg;
2078 OFF label_contents_end;
2079 int label_contents_line_index = -1;
2080 int label_is_multiline = FALSE;
2081 OFF dest_contents_beg;
2082 OFF dest_contents_end;
2083 OFF title_contents_beg;
2084 OFF title_contents_end;
2085 int title_contents_line_index;
2086 int title_is_multiline = FALSE;
2087 OFF off;
2088 int line_index = 0;
2089 int tmp_line_index;
2090 MD_REF_DEF* def = NULL;
2091 int ret = 0;
2092
2093 /* Link label. */
2094 if(!md_is_link_label(ctx, lines, n_lines, beg: lines[0].beg,
2095 p_end: &off, p_beg_line_index: &label_contents_line_index, p_end_line_index: &line_index,
2096 p_contents_beg: &label_contents_beg, p_contents_end: &label_contents_end))
2097 return FALSE;
2098 label_is_multiline = (label_contents_line_index != line_index);
2099
2100 /* Colon. */
2101 if(off >= lines[line_index].end || CH(off) != _T(':'))
2102 return FALSE;
2103 off++;
2104
2105 /* Optional white space with up to one line break. */
2106 while(off < lines[line_index].end && ISWHITESPACE(off))
2107 off++;
2108 if(off >= lines[line_index].end) {
2109 line_index++;
2110 if(line_index >= n_lines)
2111 return FALSE;
2112 off = lines[line_index].beg;
2113 }
2114
2115 /* Link destination. */
2116 if(!md_is_link_destination(ctx, beg: off, max_end: lines[line_index].end,
2117 p_end: &off, p_contents_beg: &dest_contents_beg, p_contents_end: &dest_contents_end))
2118 return FALSE;
2119
2120 /* (Optional) title. Note we interpret it as an title only if nothing
2121 * more follows on its last line. */
2122 if(md_is_link_title(ctx, lines: lines + line_index, n_lines: n_lines - line_index, beg: off,
2123 p_end: &off, p_beg_line_index: &title_contents_line_index, p_end_line_index: &tmp_line_index,
2124 p_contents_beg: &title_contents_beg, p_contents_end: &title_contents_end)
2125 && off >= lines[line_index + tmp_line_index].end)
2126 {
2127 title_is_multiline = (tmp_line_index != title_contents_line_index);
2128 title_contents_line_index += line_index;
2129 line_index += tmp_line_index;
2130 } else {
2131 /* Not a title. */
2132 title_is_multiline = FALSE;
2133 title_contents_beg = off;
2134 title_contents_end = off;
2135 title_contents_line_index = 0;
2136 }
2137
2138 /* Nothing more can follow on the last line. */
2139 if(off < lines[line_index].end)
2140 return FALSE;
2141
2142 /* So, it _is_ a reference definition. Remember it. */
2143 if(ctx->n_ref_defs >= ctx->alloc_ref_defs) {
2144 MD_REF_DEF* new_defs;
2145
2146 ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0
2147 ? ctx->alloc_ref_defs + ctx->alloc_ref_defs / 2
2148 : 16);
2149 new_defs = (MD_REF_DEF*) realloc(ptr: ctx->ref_defs, size: ctx->alloc_ref_defs * sizeof(MD_REF_DEF));
2150 if(new_defs == NULL) {
2151 MD_LOG("realloc() failed.");
2152 goto abort;
2153 }
2154
2155 ctx->ref_defs = new_defs;
2156 }
2157 def = &ctx->ref_defs[ctx->n_ref_defs];
2158 memset(s: def, c: 0, n: sizeof(MD_REF_DEF));
2159
2160 if(label_is_multiline) {
2161 MD_CHECK(md_merge_lines_alloc(ctx, label_contents_beg, label_contents_end,
2162 lines + label_contents_line_index, n_lines - label_contents_line_index,
2163 _T(' '), &def->label, &def->label_size));
2164 def->label_needs_free = TRUE;
2165 } else {
2166 def->label = (CHAR*) STR(label_contents_beg);
2167 def->label_size = label_contents_end - label_contents_beg;
2168 }
2169
2170 if(title_is_multiline) {
2171 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2172 lines + title_contents_line_index, n_lines - title_contents_line_index,
2173 _T('\n'), &def->title, &def->title_size));
2174 def->title_needs_free = TRUE;
2175 } else {
2176 def->title = (CHAR*) STR(title_contents_beg);
2177 def->title_size = title_contents_end - title_contents_beg;
2178 }
2179
2180 def->dest_beg = dest_contents_beg;
2181 def->dest_end = dest_contents_end;
2182
2183 /* Success. */
2184 ctx->n_ref_defs++;
2185 return line_index + 1;
2186
2187abort:
2188 /* Failure. */
2189 if(def != NULL && def->label_needs_free)
2190 free(ptr: def->label);
2191 if(def != NULL && def->title_needs_free)
2192 free(ptr: def->title);
2193 return ret;
2194}
2195
2196static int
2197md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
2198 OFF beg, OFF end, MD_LINK_ATTR* attr)
2199{
2200 const MD_REF_DEF* def;
2201 const MD_LINE* beg_line;
2202 const MD_LINE* end_line;
2203 CHAR* label;
2204 SZ label_size;
2205 int ret;
2206
2207 MD_ASSERT(CH(beg) == _T('[') || CH(beg) == _T('!'));
2208 MD_ASSERT(CH(end-1) == _T(']'));
2209
2210 beg += (CH(beg) == _T('!') ? 2 : 1);
2211 end--;
2212
2213 /* Find lines corresponding to the beg and end positions. */
2214 MD_ASSERT(lines[0].beg <= beg);
2215 beg_line = lines;
2216 while(beg >= beg_line->end)
2217 beg_line++;
2218
2219 MD_ASSERT(end <= lines[n_lines-1].end);
2220 end_line = beg_line;
2221 while(end >= end_line->end)
2222 end_line++;
2223
2224 if(beg_line != end_line) {
2225 MD_CHECK(md_merge_lines_alloc(ctx, beg, end, beg_line,
2226 n_lines - (beg_line - lines), _T(' '), &label, &label_size));
2227 } else {
2228 label = (CHAR*) STR(beg);
2229 label_size = end - beg;
2230 }
2231
2232 def = md_lookup_ref_def(ctx, label, label_size);
2233 if(def != NULL) {
2234 attr->dest_beg = def->dest_beg;
2235 attr->dest_end = def->dest_end;
2236 attr->title = def->title;
2237 attr->title_size = def->title_size;
2238 attr->title_needs_free = FALSE;
2239 }
2240
2241 if(beg_line != end_line)
2242 free(ptr: label);
2243
2244 ret = (def != NULL);
2245
2246abort:
2247 return ret;
2248}
2249
2250static int
2251md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
2252 OFF beg, OFF* p_end, MD_LINK_ATTR* attr)
2253{
2254 int line_index = 0;
2255 int tmp_line_index;
2256 OFF title_contents_beg;
2257 OFF title_contents_end;
2258 int title_contents_line_index;
2259 int title_is_multiline;
2260 OFF off = beg;
2261 int ret = FALSE;
2262
2263 while(off >= lines[line_index].end)
2264 line_index++;
2265
2266 MD_ASSERT(CH(off) == _T('('));
2267 off++;
2268
2269 /* Optional white space with up to one line break. */
2270 while(off < lines[line_index].end && ISWHITESPACE(off))
2271 off++;
2272 if(off >= lines[line_index].end && ISNEWLINE(off)) {
2273 line_index++;
2274 if(line_index >= n_lines)
2275 return FALSE;
2276 off = lines[line_index].beg;
2277 }
2278
2279 /* Link destination may be omitted, but only when not also having a title. */
2280 if(off < ctx->size && CH(off) == _T(')')) {
2281 attr->dest_beg = off;
2282 attr->dest_end = off;
2283 attr->title = NULL;
2284 attr->title_size = 0;
2285 attr->title_needs_free = FALSE;
2286 off++;
2287 *p_end = off;
2288 return TRUE;
2289 }
2290
2291 /* Link destination. */
2292 if(!md_is_link_destination(ctx, beg: off, max_end: lines[line_index].end,
2293 p_end: &off, p_contents_beg: &attr->dest_beg, p_contents_end: &attr->dest_end))
2294 return FALSE;
2295
2296 /* (Optional) title. */
2297 if(md_is_link_title(ctx, lines: lines + line_index, n_lines: n_lines - line_index, beg: off,
2298 p_end: &off, p_beg_line_index: &title_contents_line_index, p_end_line_index: &tmp_line_index,
2299 p_contents_beg: &title_contents_beg, p_contents_end: &title_contents_end))
2300 {
2301 title_is_multiline = (tmp_line_index != title_contents_line_index);
2302 title_contents_line_index += line_index;
2303 line_index += tmp_line_index;
2304 } else {
2305 /* Not a title. */
2306 title_is_multiline = FALSE;
2307 title_contents_beg = off;
2308 title_contents_end = off;
2309 title_contents_line_index = 0;
2310 }
2311
2312 /* Optional whitespace followed with final ')'. */
2313 while(off < lines[line_index].end && ISWHITESPACE(off))
2314 off++;
2315 if(off >= lines[line_index].end && ISNEWLINE(off)) {
2316 line_index++;
2317 if(line_index >= n_lines)
2318 return FALSE;
2319 off = lines[line_index].beg;
2320 }
2321 if(CH(off) != _T(')'))
2322 goto abort;
2323 off++;
2324
2325 if(title_contents_beg >= title_contents_end) {
2326 attr->title = NULL;
2327 attr->title_size = 0;
2328 attr->title_needs_free = FALSE;
2329 } else if(!title_is_multiline) {
2330 attr->title = (CHAR*) STR(title_contents_beg);
2331 attr->title_size = title_contents_end - title_contents_beg;
2332 attr->title_needs_free = FALSE;
2333 } else {
2334 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2335 lines + title_contents_line_index, n_lines - title_contents_line_index,
2336 _T('\n'), &attr->title, &attr->title_size));
2337 attr->title_needs_free = TRUE;
2338 }
2339
2340 *p_end = off;
2341 ret = TRUE;
2342
2343abort:
2344 return ret;
2345}
2346
2347static void
2348md_free_ref_defs(MD_CTX* ctx)
2349{
2350 int i;
2351
2352 for(i = 0; i < ctx->n_ref_defs; i++) {
2353 MD_REF_DEF* def = &ctx->ref_defs[i];
2354
2355 if(def->label_needs_free)
2356 free(ptr: def->label);
2357 if(def->title_needs_free)
2358 free(ptr: def->title);
2359 }
2360
2361 free(ptr: ctx->ref_defs);
2362}
2363
2364
2365/******************************************
2366 *** Processing Inlines (a.k.a Spans) ***
2367 ******************************************/
2368
/* We process inlines in a few phases:
2370 *
2371 * (1) We go through the block text and collect all significant characters
2372 * which may start/end a span or some other significant position into
2373 * ctx->marks[]. Core of this is what md_collect_marks() does.
2374 *
2375 * We also do some very brief preliminary context-less analysis, whether
2376 * it might be opener or closer (e.g. of an emphasis span).
2377 *
2378 * This speeds the other steps as we do not need to re-iterate over all
2379 * characters anymore.
2380 *
 * (2) We analyze each potential mark type, in the order of their precedence.
2382 *
2383 * In each md_analyze_XXX() function, we re-iterate list of the marks,
2384 * skipping already resolved regions (in preceding precedences) and try to
2385 * resolve them.
2386 *
2387 * (2.1) For trivial marks, which are single (e.g. HTML entity), we just mark
2388 * them as resolved.
2389 *
2390 * (2.2) For range-type marks, we analyze whether the mark could be closer
2391 * and, if yes, whether there is some preceding opener it could satisfy.
2392 *
2393 * If not we check whether it could be really an opener and if yes, we
2394 * remember it so subsequent closers may resolve it.
2395 *
 * (3) Finally, when all marks were analyzed, we render the block contents
 *     by calling the MD_PARSER::text() callback, interrupted by calls of
 *     ::enter_span() or ::leave_span() whenever we reach a resolved mark.
2399 */
2400
2401
2402/* The mark structure.
2403 *
2404 * '\\': Maybe escape sequence.
2405 * '\0': NULL char.
2406 * '*': Maybe (strong) emphasis start/end.
2407 * '_': Maybe (strong) emphasis start/end.
2408 * '~': Maybe strikethrough start/end (needs MD_FLAG_STRIKETHROUGH).
2409 * '`': Maybe code span start/end.
2410 * '&': Maybe start of entity.
2411 * ';': Maybe end of entity.
2412 * '<': Maybe start of raw HTML or autolink.
2413 * '>': Maybe end of raw HTML or autolink.
2414 * '[': Maybe start of link label or link text.
2415 * '!': Equivalent of '[' for image.
2416 * ']': Maybe end of link label or link text.
2417 * '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS).
2418 * ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS).
2419 * '.': Maybe permissive WWW auto-link (needs MD_FLAG_PERMISSIVEWWWAUTOLINKS).
2420 * 'D': Dummy mark, it reserves a space for splitting a previous mark
2421 * (e.g. emphasis) or to make more space for storing some special data
2422 * related to the preceding mark (e.g. link).
2423 *
2424 * Note that not all instances of these chars in the text imply creation of the
2425 * structure. Only those which have (or may have, after we see more context)
2426 * the special meaning.
2427 *
2428 * (Keep this struct as small as possible to fit as much of them into CPU
2429 * cache line.)
2430 */
2431struct MD_MARK_tag {
2432 OFF beg;
2433 OFF end;
2434
2435 /* For unresolved openers, 'prev' and 'next' form the chain of open openers
2436 * of given type 'ch'.
2437 *
2438 * During resolving, we disconnect from the chain and point to the
2439 * corresponding counterpart so opener points to its closer and vice versa.
2440 */
2441 int prev;
2442 int next;
2443 CHAR ch;
2444 unsigned char flags;
2445};
2446
/* Mark flags (these apply to ALL mark types). */
#define MD_MARK_POTENTIAL_OPENER            0x01    /* Maybe opener. */
#define MD_MARK_POTENTIAL_CLOSER            0x02    /* Maybe closer. */
#define MD_MARK_OPENER                      0x04    /* Definitely opener. */
#define MD_MARK_CLOSER                      0x08    /* Definitely closer. */
#define MD_MARK_RESOLVED                    0x10    /* Resolved in any definite way. */

/* Mark flags specific for various mark types (so they can share bits). */

/* Emphasis marks: Helpers for the "rule of 3". */
#define MD_MARK_EMPH_INTRAWORD              0x20
#define MD_MARK_EMPH_MOD3_0                 0x40
#define MD_MARK_EMPH_MOD3_1                 0x80
#define MD_MARK_EMPH_MOD3_2                 (MD_MARK_EMPH_MOD3_0 | MD_MARK_EMPH_MOD3_1)
#define MD_MARK_EMPH_MOD3_MASK              (MD_MARK_EMPH_MOD3_0 | MD_MARK_EMPH_MOD3_1)

/* Marks '<' and '>': Distinguisher for autolinks. */
#define MD_MARK_AUTOLINK                    0x20

/* Marks of permissive autolinks. */
#define MD_MARK_VALIDPERMISSIVEAUTOLINK     0x20
2462
2463static MD_MARKCHAIN*
2464md_asterisk_chain(MD_CTX* ctx, unsigned flags)
2465{
2466 switch(flags & (MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_MASK)) {
2467 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_intraword_mod3_0;
2468 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_intraword_mod3_1;
2469 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_intraword_mod3_2;
2470 case MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_extraword_mod3_0;
2471 case MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_extraword_mod3_1;
2472 case MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_extraword_mod3_2;
2473 default: MD_UNREACHABLE();
2474 }
2475 return NULL;
2476}
2477
2478static MD_MARKCHAIN*
2479md_mark_chain(MD_CTX* ctx, int mark_index)
2480{
2481 MD_MARK* mark = &ctx->marks[mark_index];
2482
2483 switch(mark->ch) {
2484 case _T('*'): return md_asterisk_chain(ctx, flags: mark->flags);
2485 case _T('_'): return &UNDERSCORE_OPENERS;
2486 case _T('~'): return (mark->end - mark->beg == 1) ? &TILDE_OPENERS_1 : &TILDE_OPENERS_2;
2487 case _T('['): return &BRACKET_OPENERS;
2488 case _T('|'): return &TABLECELLBOUNDARIES;
2489 default: return NULL;
2490 }
2491}
2492
2493static MD_MARK*
2494md_push_mark(MD_CTX* ctx)
2495{
2496 if(ctx->n_marks >= ctx->alloc_marks) {
2497 MD_MARK* new_marks;
2498
2499 ctx->alloc_marks = (ctx->alloc_marks > 0
2500 ? ctx->alloc_marks + ctx->alloc_marks / 2
2501 : 64);
2502 new_marks = realloc(ptr: ctx->marks, size: ctx->alloc_marks * sizeof(MD_MARK));
2503 if(new_marks == NULL) {
2504 MD_LOG("realloc() failed.");
2505 return NULL;
2506 }
2507
2508 ctx->marks = new_marks;
2509 }
2510
2511 return &ctx->marks[ctx->n_marks++];
2512}
2513
/* Helper macros for pushing new marks. They may be used only in a function
 * which provides the variables 'ctx', 'mark' (MD_MARK*) and 'ret' (int) and
 * the label 'abort'.
 *
 * PUSH_MARK_() makes 'mark' point to a new (uninitialized) mark; on a
 * failure it sets 'ret' to -1 and jumps to the label 'abort'. */
#define PUSH_MARK_()                                                    \
        do {                                                            \
            mark = md_push_mark(ctx);                                   \
            if(mark == NULL) {                                          \
                ret = -1;                                               \
                goto abort;                                             \
            }                                                           \
        } while(0)

/* PUSH_MARK() does the same and also initializes the new mark so it is not
 * a member of any chain. (The cast uses CHAR, the type of MD_MARK::ch.) */
#define PUSH_MARK(ch_, beg_, end_, flags_)                              \
        do {                                                            \
            PUSH_MARK_();                                               \
            mark->beg = (beg_);                                         \
            mark->end = (end_);                                         \
            mark->prev = -1;                                            \
            mark->next = -1;                                            \
            mark->ch = (CHAR)(ch_);                                     \
            mark->flags = (flags_);                                     \
        } while(0)
2533
2534
2535static void
2536md_mark_chain_append(MD_CTX* ctx, MD_MARKCHAIN* chain, int mark_index)
2537{
2538 if(chain->tail >= 0)
2539 ctx->marks[chain->tail].next = mark_index;
2540 else
2541 chain->head = mark_index;
2542
2543 ctx->marks[mark_index].prev = chain->tail;
2544 ctx->marks[mark_index].next = -1;
2545 chain->tail = mark_index;
2546}
2547
2548/* Sometimes, we need to store a pointer into the mark. It is quite rare
2549 * so we do not bother to make MD_MARK use union, and it can only happen
2550 * for dummy marks. */
2551static inline void
2552md_mark_store_ptr(MD_CTX* ctx, int mark_index, void* ptr)
2553{
2554 MD_MARK* mark = &ctx->marks[mark_index];
2555 MD_ASSERT(mark->ch == 'D');
2556
2557 /* Check only members beg and end are misused for this. */
2558 MD_ASSERT(sizeof(void*) <= 2 * sizeof(OFF));
2559 memcpy(dest: mark, src: &ptr, n: sizeof(void*));
2560}
2561
2562static inline void*
2563md_mark_get_ptr(MD_CTX* ctx, int mark_index)
2564{
2565 void* ptr;
2566 MD_MARK* mark = &ctx->marks[mark_index];
2567 MD_ASSERT(mark->ch == 'D');
2568 memcpy(dest: &ptr, src: mark, n: sizeof(void*));
2569 return ptr;
2570}
2571
2572static void
2573md_resolve_range(MD_CTX* ctx, MD_MARKCHAIN* chain, int opener_index, int closer_index)
2574{
2575 MD_MARK* opener = &ctx->marks[opener_index];
2576 MD_MARK* closer = &ctx->marks[closer_index];
2577
2578 /* Remove opener from the list of openers. */
2579 if(chain != NULL) {
2580 if(opener->prev >= 0)
2581 ctx->marks[opener->prev].next = opener->next;
2582 else
2583 chain->head = opener->next;
2584
2585 if(opener->next >= 0)
2586 ctx->marks[opener->next].prev = opener->prev;
2587 else
2588 chain->tail = opener->prev;
2589 }
2590
2591 /* Interconnect opener and closer and mark both as resolved. */
2592 opener->next = closer_index;
2593 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
2594 closer->prev = opener_index;
2595 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
2596}
2597
2598
2599#define MD_ROLLBACK_ALL 0
2600#define MD_ROLLBACK_CROSSING 1
2601
2602/* In the range ctx->marks[opener_index] ... [closer_index], undo some or all
2603 * resolvings accordingly to these rules:
2604 *
2605 * (1) All openers BEFORE the range corresponding to any closer inside the
2606 * range are un-resolved and they are re-added to their respective chains
2607 * of unresolved openers. This ensures we can reuse the opener for closers
2608 * AFTER the range.
2609 *
2610 * (2) If 'how' is MD_ROLLBACK_ALL, then ALL resolved marks inside the range
2611 * are discarded.
2612 *
2613 * (3) If 'how' is MD_ROLLBACK_CROSSING, only closers with openers handled
2614 * in (1) are discarded. I.e. pairs of openers and closers which are both
2615 * inside the range are retained as well as any unpaired marks.
2616 */
2617static void
2618md_rollback(MD_CTX* ctx, int opener_index, int closer_index, int how)
2619{
2620 int i;
2621 int mark_index;
2622
2623 /* Cut all unresolved openers at the mark index. */
2624 for(i = OPENERS_CHAIN_FIRST; i < OPENERS_CHAIN_LAST+1; i++) {
2625 MD_MARKCHAIN* chain = &ctx->mark_chains[i];
2626
2627 while(chain->tail >= opener_index)
2628 chain->tail = ctx->marks[chain->tail].prev;
2629
2630 if(chain->tail >= 0)
2631 ctx->marks[chain->tail].next = -1;
2632 else
2633 chain->head = -1;
2634 }
2635
2636 /* Go backwards so that unresolved openers are re-added into their
2637 * respective chains, in the right order. */
2638 mark_index = closer_index - 1;
2639 while(mark_index > opener_index) {
2640 MD_MARK* mark = &ctx->marks[mark_index];
2641 int mark_flags = mark->flags;
2642 int discard_flag = (how == MD_ROLLBACK_ALL);
2643
2644 if(mark->flags & MD_MARK_CLOSER) {
2645 int mark_opener_index = mark->prev;
2646
2647 /* Undo opener BEFORE the range. */
2648 if(mark_opener_index < opener_index) {
2649 MD_MARK* mark_opener = &ctx->marks[mark_opener_index];
2650 MD_MARKCHAIN* chain;
2651
2652 mark_opener->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
2653 chain = md_mark_chain(ctx, mark_index: opener_index);
2654 if(chain != NULL) {
2655 md_mark_chain_append(ctx, chain, mark_index: mark_opener_index);
2656 discard_flag = 1;
2657 }
2658 }
2659 }
2660
2661 /* And reset our flags. */
2662 if(discard_flag)
2663 mark->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
2664
2665 /* Jump as far as we can over unresolved or non-interesting marks. */
2666 switch(how) {
2667 case MD_ROLLBACK_CROSSING:
2668 if((mark_flags & MD_MARK_CLOSER) && mark->prev > opener_index) {
2669 /* If we are closer with opener INSIDE the range, there may
2670 * not be any other crosser inside the subrange. */
2671 mark_index = mark->prev;
2672 break;
2673 }
2674 /* Pass through. */
2675 default:
2676 mark_index--;
2677 break;
2678 }
2679 }
2680}
2681
2682static void
2683md_build_mark_char_map(MD_CTX* ctx)
2684{
2685 memset(s: ctx->mark_char_map, c: 0, n: sizeof(ctx->mark_char_map));
2686
2687 ctx->mark_char_map['\\'] = 1;
2688 ctx->mark_char_map['*'] = 1;
2689 ctx->mark_char_map['_'] = 1;
2690 ctx->mark_char_map['`'] = 1;
2691 ctx->mark_char_map['&'] = 1;
2692 ctx->mark_char_map[';'] = 1;
2693 ctx->mark_char_map['<'] = 1;
2694 ctx->mark_char_map['>'] = 1;
2695 ctx->mark_char_map['['] = 1;
2696 ctx->mark_char_map['!'] = 1;
2697 ctx->mark_char_map[']'] = 1;
2698 ctx->mark_char_map['\0'] = 1;
2699
2700 if(ctx->parser.flags & MD_FLAG_STRIKETHROUGH)
2701 ctx->mark_char_map['~'] = 1;
2702
2703 if(ctx->parser.flags & MD_FLAG_LATEXMATHSPANS)
2704 ctx->mark_char_map['$'] = 1;
2705
2706 if(ctx->parser.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
2707 ctx->mark_char_map['@'] = 1;
2708
2709 if(ctx->parser.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS)
2710 ctx->mark_char_map[':'] = 1;
2711
2712 if(ctx->parser.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS)
2713 ctx->mark_char_map['.'] = 1;
2714
2715 if((ctx->parser.flags & MD_FLAG_TABLES) || (ctx->parser.flags & MD_FLAG_WIKILINKS))
2716 ctx->mark_char_map['|'] = 1;
2717
2718 if(ctx->parser.flags & MD_FLAG_COLLAPSEWHITESPACE) {
2719 int i;
2720
2721 for(i = 0; i < (int) sizeof(ctx->mark_char_map); i++) {
2722 if(ISWHITESPACE_(i))
2723 ctx->mark_char_map[i] = 1;
2724 }
2725 }
2726}
2727
2728/* We limit code span marks to lower than 32 backticks. This solves the
2729 * pathologic case of too many openers, each of different length: Their
2730 * resolving would be then O(n^2). */
2731#define CODESPAN_MARK_MAXLEN 32
2732
2733static int
2734md_is_code_span(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
2735 OFF* p_opener_beg, OFF* p_opener_end,
2736 OFF* p_closer_beg, OFF* p_closer_end,
2737 OFF last_potential_closers[CODESPAN_MARK_MAXLEN],
2738 int* p_reached_paragraph_end)
2739{
2740 OFF opener_beg = beg;
2741 OFF opener_end;
2742 OFF closer_beg;
2743 OFF closer_end;
2744 SZ mark_len;
2745 OFF line_end;
2746 int has_space_after_opener = FALSE;
2747 int has_eol_after_opener = FALSE;
2748 int has_space_before_closer = FALSE;
2749 int has_eol_before_closer = FALSE;
2750 int has_only_space = TRUE;
2751 int line_index = 0;
2752
2753 line_end = lines[0].end;
2754 opener_end = opener_beg;
2755 while(opener_end < line_end && CH(opener_end) == _T('`'))
2756 opener_end++;
2757 has_space_after_opener = (opener_end < line_end && CH(opener_end) == _T(' '));
2758 has_eol_after_opener = (opener_end == line_end);
2759
2760 /* The caller needs to know end of the opening mark even if we fail. */
2761 *p_opener_end = opener_end;
2762
2763 mark_len = opener_end - opener_beg;
2764 if(mark_len > CODESPAN_MARK_MAXLEN)
2765 return FALSE;
2766
2767 /* Check whether we already know there is no closer of this length.
2768 * If so, re-scan does no sense. This fixes issue #59. */
2769 if(last_potential_closers[mark_len-1] >= lines[n_lines-1].end ||
2770 (*p_reached_paragraph_end && last_potential_closers[mark_len-1] < opener_end))
2771 return FALSE;
2772
2773 closer_beg = opener_end;
2774 closer_end = opener_end;
2775
2776 /* Find closer mark. */
2777 while(TRUE) {
2778 while(closer_beg < line_end && CH(closer_beg) != _T('`')) {
2779 if(CH(closer_beg) != _T(' '))
2780 has_only_space = FALSE;
2781 closer_beg++;
2782 }
2783 closer_end = closer_beg;
2784 while(closer_end < line_end && CH(closer_end) == _T('`'))
2785 closer_end++;
2786
2787 if(closer_end - closer_beg == mark_len) {
2788 /* Success. */
2789 has_space_before_closer = (closer_beg > lines[line_index].beg && CH(closer_beg-1) == _T(' '));
2790 has_eol_before_closer = (closer_beg == lines[line_index].beg);
2791 break;
2792 }
2793
2794 if(closer_end - closer_beg > 0) {
2795 /* We have found a back-tick which is not part of the closer. */
2796 has_only_space = FALSE;
2797
2798 /* But if we eventually fail, remember it as a potential closer
2799 * of its own length for future attempts. This mitigates needs for
2800 * rescans. */
2801 if(closer_end - closer_beg < CODESPAN_MARK_MAXLEN) {
2802 if(closer_beg > last_potential_closers[closer_end - closer_beg - 1])
2803 last_potential_closers[closer_end - closer_beg - 1] = closer_beg;
2804 }
2805 }
2806
2807 if(closer_end >= line_end) {
2808 line_index++;
2809 if(line_index >= n_lines) {
2810 /* Reached end of the paragraph and still nothing. */
2811 *p_reached_paragraph_end = TRUE;
2812 return FALSE;
2813 }
2814 /* Try on the next line. */
2815 line_end = lines[line_index].end;
2816 closer_beg = lines[line_index].beg;
2817 } else {
2818 closer_beg = closer_end;
2819 }
2820 }
2821
2822 /* If there is a space or a new line both after and before the opener
2823 * (and if the code span is not made of spaces only), consume one initial
2824 * and one trailing space as part of the marks. */
2825 if(!has_only_space &&
2826 (has_space_after_opener || has_eol_after_opener) &&
2827 (has_space_before_closer || has_eol_before_closer))
2828 {
2829 if(has_space_after_opener)
2830 opener_end++;
2831 else
2832 opener_end = lines[1].beg;
2833
2834 if(has_space_before_closer)
2835 closer_beg--;
2836 else {
2837 closer_beg = lines[line_index-1].end;
2838 /* We need to eat the preceding "\r\n" but not any line trailing
2839 * spaces. */
2840 while(closer_beg < ctx->size && ISBLANK(closer_beg))
2841 closer_beg++;
2842 }
2843 }
2844
2845 *p_opener_beg = opener_beg;
2846 *p_opener_end = opener_end;
2847 *p_closer_beg = closer_beg;
2848 *p_closer_end = closer_end;
2849 return TRUE;
2850}
2851
2852static int
2853md_is_autolink_uri(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2854{
2855 OFF off = beg+1;
2856
2857 MD_ASSERT(CH(beg) == _T('<'));
2858
2859 /* Check for scheme. */
2860 if(off >= max_end || !ISASCII(off))
2861 return FALSE;
2862 off++;
2863 while(1) {
2864 if(off >= max_end)
2865 return FALSE;
2866 if(off - beg > 32)
2867 return FALSE;
2868 if(CH(off) == _T(':') && off - beg >= 3)
2869 break;
2870 if(!ISALNUM(off) && CH(off) != _T('+') && CH(off) != _T('-') && CH(off) != _T('.'))
2871 return FALSE;
2872 off++;
2873 }
2874
2875 /* Check the path after the scheme. */
2876 while(off < max_end && CH(off) != _T('>')) {
2877 if(ISWHITESPACE(off) || ISCNTRL(off) || CH(off) == _T('<'))
2878 return FALSE;
2879 off++;
2880 }
2881
2882 if(off >= max_end)
2883 return FALSE;
2884
2885 MD_ASSERT(CH(off) == _T('>'));
2886 *p_end = off+1;
2887 return TRUE;
2888}
2889
2890static int
2891md_is_autolink_email(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2892{
2893 OFF off = beg + 1;
2894 int label_len;
2895
2896 MD_ASSERT(CH(beg) == _T('<'));
2897
2898 /* The code should correspond to this regexp:
2899 /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+
2900 @[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
2901 (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/
2902 */
2903
2904 /* Username (before '@'). */
2905 while(off < max_end && (ISALNUM(off) || ISANYOF(off, _T(".!#$%&'*+/=?^_`{|}~-"))))
2906 off++;
2907 if(off <= beg+1)
2908 return FALSE;
2909
2910 /* '@' */
2911 if(off >= max_end || CH(off) != _T('@'))
2912 return FALSE;
2913 off++;
2914
2915 /* Labels delimited with '.'; each label is sequence of 1 - 63 alnum
2916 * characters or '-', but '-' is not allowed as first or last char. */
2917 label_len = 0;
2918 while(off < max_end) {
2919 if(ISALNUM(off))
2920 label_len++;
2921 else if(CH(off) == _T('-') && label_len > 0)
2922 label_len++;
2923 else if(CH(off) == _T('.') && label_len > 0 && CH(off-1) != _T('-'))
2924 label_len = 0;
2925 else
2926 break;
2927
2928 if(label_len > 63)
2929 return FALSE;
2930
2931 off++;
2932 }
2933
2934 if(label_len <= 0 || off >= max_end || CH(off) != _T('>') || CH(off-1) == _T('-'))
2935 return FALSE;
2936
2937 *p_end = off+1;
2938 return TRUE;
2939}
2940
2941static int
2942md_is_autolink(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, int* p_missing_mailto)
2943{
2944 if(md_is_autolink_uri(ctx, beg, max_end, p_end)) {
2945 *p_missing_mailto = FALSE;
2946 return TRUE;
2947 }
2948
2949 if(md_is_autolink_email(ctx, beg, max_end, p_end)) {
2950 *p_missing_mailto = TRUE;
2951 return TRUE;
2952 }
2953
2954 return FALSE;
2955}
2956
2957static int
2958md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
2959{
2960 int i;
2961 int ret = 0;
2962 MD_MARK* mark;
2963 OFF codespan_last_potential_closers[CODESPAN_MARK_MAXLEN] = { 0 };
2964 int codespan_scanned_till_paragraph_end = FALSE;
2965
2966 for(i = 0; i < n_lines; i++) {
2967 const MD_LINE* line = &lines[i];
2968 OFF off = line->beg;
2969 OFF line_end = line->end;
2970
2971 while(TRUE) {
2972 CHAR ch;
2973
2974#ifdef MD4C_USE_UTF16
2975 /* For UTF-16, mark_char_map[] covers only ASCII. */
2976 #define IS_MARK_CHAR(off) ((CH(off) < SIZEOF_ARRAY(ctx->mark_char_map)) && \
2977 (ctx->mark_char_map[(unsigned char) CH(off)]))
2978#else
2979 /* For 8-bit encodings, mark_char_map[] covers all 256 elements. */
2980 #define IS_MARK_CHAR(off) (ctx->mark_char_map[(unsigned char) CH(off)])
2981#endif
2982
2983 /* Optimization: Use some loop unrolling. */
2984 while(off + 3 < line_end && !IS_MARK_CHAR(off+0) && !IS_MARK_CHAR(off+1)
2985 && !IS_MARK_CHAR(off+2) && !IS_MARK_CHAR(off+3))
2986 off += 4;
2987 while(off < line_end && !IS_MARK_CHAR(off+0))
2988 off++;
2989
2990 if(off >= line_end)
2991 break;
2992
2993 ch = CH(off);
2994
2995 /* A backslash escape.
2996 * It can go beyond line->end as it may involve escaped new
2997 * line to form a hard break. */
2998 if(ch == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
2999 /* Hard-break cannot be on the last line of the block. */
3000 if(!ISNEWLINE(off+1) || i+1 < n_lines)
3001 PUSH_MARK(ch, off, off+2, MD_MARK_RESOLVED);
3002 off += 2;
3003 continue;
3004 }
3005
3006 /* A potential (string) emphasis start/end. */
3007 if(ch == _T('*') || ch == _T('_')) {
3008 OFF tmp = off+1;
3009 int left_level; /* What precedes: 0 = whitespace; 1 = punctuation; 2 = other char. */
3010 int right_level; /* What follows: 0 = whitespace; 1 = punctuation; 2 = other char. */
3011
3012 while(tmp < line_end && CH(tmp) == ch)
3013 tmp++;
3014
3015 if(off == line->beg || ISUNICODEWHITESPACEBEFORE(off))
3016 left_level = 0;
3017 else if(ISUNICODEPUNCTBEFORE(off))
3018 left_level = 1;
3019 else
3020 left_level = 2;
3021
3022 if(tmp == line_end || ISUNICODEWHITESPACE(tmp))
3023 right_level = 0;
3024 else if(ISUNICODEPUNCT(tmp))
3025 right_level = 1;
3026 else
3027 right_level = 2;
3028
3029 /* Intra-word underscore doesn't have special meaning. */
3030 if(ch == _T('_') && left_level == 2 && right_level == 2) {
3031 left_level = 0;
3032 right_level = 0;
3033 }
3034
3035 if(left_level != 0 || right_level != 0) {
3036 unsigned flags = 0;
3037
3038 if(left_level > 0 && left_level >= right_level)
3039 flags |= MD_MARK_POTENTIAL_CLOSER;
3040 if(right_level > 0 && right_level >= left_level)
3041 flags |= MD_MARK_POTENTIAL_OPENER;
3042 if(left_level == 2 && right_level == 2)
3043 flags |= MD_MARK_EMPH_INTRAWORD;
3044
3045 /* For "the rule of three" we need to remember the original
3046 * size of the mark (modulo three), before we potentially
3047 * split the mark when being later resolved partially by some
3048 * shorter closer. */
3049 switch((tmp - off) % 3) {
3050 case 0: flags |= MD_MARK_EMPH_MOD3_0; break;
3051 case 1: flags |= MD_MARK_EMPH_MOD3_1; break;
3052 case 2: flags |= MD_MARK_EMPH_MOD3_2; break;
3053 }
3054
3055 PUSH_MARK(ch, off, tmp, flags);
3056
3057 /* During resolving, multiple asterisks may have to be
3058 * split into independent span start/ends. Consider e.g.
3059 * "**foo* bar*". Therefore we push also some empty dummy
3060 * marks to have enough space for that. */
3061 off++;
3062 while(off < tmp) {
3063 PUSH_MARK('D', off, off, 0);
3064 off++;
3065 }
3066 continue;
3067 }
3068
3069 off = tmp;
3070 continue;
3071 }
3072
3073 /* A potential code span start/end. */
3074 if(ch == _T('`')) {
3075 OFF opener_beg, opener_end;
3076 OFF closer_beg, closer_end;
3077 int is_code_span;
3078
3079 is_code_span = md_is_code_span(ctx, lines: lines + i, n_lines: n_lines - i, beg: off,
3080 p_opener_beg: &opener_beg, p_opener_end: &opener_end, p_closer_beg: &closer_beg, p_closer_end: &closer_end,
3081 last_potential_closers: codespan_last_potential_closers,
3082 p_reached_paragraph_end: &codespan_scanned_till_paragraph_end);
3083 if(is_code_span) {
3084 PUSH_MARK(_T('`'), opener_beg, opener_end, MD_MARK_OPENER | MD_MARK_RESOLVED);
3085 PUSH_MARK(_T('`'), closer_beg, closer_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
3086 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3087 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3088
3089 off = closer_end;
3090
3091 /* Advance the current line accordingly. */
3092 while(off > line_end) {
3093 i++;
3094 line++;
3095 line_end = line->end;
3096 }
3097 continue;
3098 }
3099
3100 off = opener_end;
3101 continue;
3102 }
3103
3104 /* A potential entity start. */
3105 if(ch == _T('&')) {
3106 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
3107 off++;
3108 continue;
3109 }
3110
3111 /* A potential entity end. */
3112 if(ch == _T(';')) {
3113 /* We surely cannot be entity unless the previous mark is '&'. */
3114 if(ctx->n_marks > 0 && ctx->marks[ctx->n_marks-1].ch == _T('&'))
3115 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
3116
3117 off++;
3118 continue;
3119 }
3120
3121 /* A potential autolink or raw HTML start/end. */
3122 if(ch == _T('<')) {
3123 int is_autolink;
3124 OFF autolink_end;
3125 int missing_mailto;
3126
3127 if(!(ctx->parser.flags & MD_FLAG_NOHTMLSPANS)) {
3128 int is_html;
3129 OFF html_end;
3130
3131 /* Given the nature of the raw HTML, we have to recognize
3132 * it here. Doing so later in md_analyze_lt_gt() could
3133 * open can of worms of quadratic complexity. */
3134 is_html = md_is_html_any(ctx, lines: lines + i, n_lines: n_lines - i, beg: off,
3135 max_end: lines[n_lines-1].end, p_end: &html_end);
3136 if(is_html) {
3137 PUSH_MARK(_T('<'), off, off, MD_MARK_OPENER | MD_MARK_RESOLVED);
3138 PUSH_MARK(_T('>'), html_end, html_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
3139 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3140 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3141 off = html_end;
3142
3143 /* Advance the current line accordingly. */
3144 while(off > line_end) {
3145 i++;
3146 line++;
3147 line_end = line->end;
3148 }
3149 continue;
3150 }
3151 }
3152
3153 is_autolink = md_is_autolink(ctx, beg: off, max_end: lines[n_lines-1].end,
3154 p_end: &autolink_end, p_missing_mailto: &missing_mailto);
3155 if(is_autolink) {
3156 PUSH_MARK((missing_mailto ? _T('@') : _T('<')), off, off+1,
3157 MD_MARK_OPENER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
3158 PUSH_MARK(_T('>'), autolink_end-1, autolink_end,
3159 MD_MARK_CLOSER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
3160 ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3161 ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3162 off = autolink_end;
3163 continue;
3164 }
3165
3166 off++;
3167 continue;
3168 }
3169
3170 /* A potential link or its part. */
3171 if(ch == _T('[') || (ch == _T('!') && off+1 < line_end && CH(off+1) == _T('['))) {
3172 OFF tmp = (ch == _T('[') ? off+1 : off+2);
3173 PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER);
3174 off = tmp;
3175 /* Two dummies to make enough place for data we need if it is
3176 * a link. */
3177 PUSH_MARK('D', off, off, 0);
3178 PUSH_MARK('D', off, off, 0);
3179 continue;
3180 }
3181 if(ch == _T(']')) {
3182 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
3183 off++;
3184 continue;
3185 }
3186
3187 /* A potential permissive e-mail autolink. */
3188 if(ch == _T('@')) {
3189 if(line->beg + 1 <= off && ISALNUM(off-1) &&
3190 off + 3 < line->end && ISALNUM(off+1))
3191 {
3192 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
3193 /* Push a dummy as a reserve for a closer. */
3194 PUSH_MARK('D', off, off, 0);
3195 }
3196
3197 off++;
3198 continue;
3199 }
3200
3201 /* A potential permissive URL autolink. */
3202 if(ch == _T(':')) {
3203 static struct {
3204 const CHAR* scheme;
3205 SZ scheme_size;
3206 const CHAR* suffix;
3207 SZ suffix_size;
3208 } scheme_map[] = {
3209 /* In the order from the most frequently used, arguably. */
3210 { _T("http"), 4, _T("//"), 2 },
3211 { _T("https"), 5, _T("//"), 2 },
3212 { _T("ftp"), 3, _T("//"), 2 }
3213 };
3214 int scheme_index;
3215
3216 for(scheme_index = 0; scheme_index < (int) SIZEOF_ARRAY(scheme_map); scheme_index++) {
3217 const CHAR* scheme = scheme_map[scheme_index].scheme;
3218 const SZ scheme_size = scheme_map[scheme_index].scheme_size;
3219 const CHAR* suffix = scheme_map[scheme_index].suffix;
3220 const SZ suffix_size = scheme_map[scheme_index].suffix_size;
3221
3222 if(line->beg + scheme_size <= off && md_ascii_eq(STR(off-scheme_size), s2: scheme, n: scheme_size) &&
3223 (line->beg + scheme_size == off || ISWHITESPACE(off-scheme_size-1) || ISANYOF(off-scheme_size-1, _T("*_~(["))) &&
3224 off + 1 + suffix_size < line->end && md_ascii_eq(STR(off+1), s2: suffix, n: suffix_size))
3225 {
3226 PUSH_MARK(ch, off-scheme_size, off+1+suffix_size, MD_MARK_POTENTIAL_OPENER);
3227 /* Push a dummy as a reserve for a closer. */
3228 PUSH_MARK('D', off, off, 0);
3229 off += 1 + suffix_size;
3230 continue;
3231 }
3232 }
3233
3234 off++;
3235 continue;
3236 }
3237
3238 /* A potential permissive WWW autolink. */
3239 if(ch == _T('.')) {
3240 if(line->beg + 3 <= off && md_ascii_eq(STR(off-3), _T("www"), n: 3) &&
3241 (line->beg + 3 == off || ISWHITESPACE(off-4) || ISANYOF(off-4, _T("*_~(["))) &&
3242 off + 1 < line_end)
3243 {
3244 PUSH_MARK(ch, off-3, off+1, MD_MARK_POTENTIAL_OPENER);
3245 /* Push a dummy as a reserve for a closer. */
3246 PUSH_MARK('D', off, off, 0);
3247 off++;
3248 continue;
3249 }
3250
3251 off++;
3252 continue;
3253 }
3254
3255 /* A potential table cell boundary or wiki link label delimiter. */
3256 if((table_mode || ctx->parser.flags & MD_FLAG_WIKILINKS) && ch == _T('|')) {
3257 PUSH_MARK(ch, off, off+1, 0);
3258 off++;
3259 continue;
3260 }
3261
3262 /* A potential strikethrough start/end. */
3263 if(ch == _T('~')) {
3264 OFF tmp = off+1;
3265
3266 while(tmp < line_end && CH(tmp) == _T('~'))
3267 tmp++;
3268
3269 if(tmp - off < 3) {
3270 unsigned flags = 0;
3271
3272 if(tmp < line_end && !ISUNICODEWHITESPACE(tmp))
3273 flags |= MD_MARK_POTENTIAL_OPENER;
3274 if(off > line->beg && !ISUNICODEWHITESPACEBEFORE(off))
3275 flags |= MD_MARK_POTENTIAL_CLOSER;
3276 if(flags != 0)
3277 PUSH_MARK(ch, off, tmp, flags);
3278 }
3279
3280 off = tmp;
3281 continue;
3282 }
3283
3284 /* A potential equation start/end */
3285 if(ch == _T('$')) {
3286 /* We can have at most two consecutive $ signs,
3287 * where two dollar signs signify a display equation. */
3288 OFF tmp = off+1;
3289
3290 while(tmp < line_end && CH(tmp) == _T('$'))
3291 tmp++;
3292
3293 if (tmp - off <= 2)
3294 PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER);
3295 off = tmp;
3296 continue;
3297 }
3298
3299 /* Turn non-trivial whitespace into single space. */
3300 if(ISWHITESPACE_(ch)) {
3301 OFF tmp = off+1;
3302
3303 while(tmp < line_end && ISWHITESPACE(tmp))
3304 tmp++;
3305
3306 if(tmp - off > 1 || ch != _T(' '))
3307 PUSH_MARK(ch, off, tmp, MD_MARK_RESOLVED);
3308
3309 off = tmp;
3310 continue;
3311 }
3312
3313 /* NULL character. */
3314 if(ch == _T('\0')) {
3315 PUSH_MARK(ch, off, off+1, MD_MARK_RESOLVED);
3316 off++;
3317 continue;
3318 }
3319
3320 off++;
3321 }
3322 }
3323
3324 /* Add a dummy mark at the end of the mark vector to simplify
3325 * process_inlines(). */
3326 PUSH_MARK(127, ctx->size, ctx->size, MD_MARK_RESOLVED);
3327
3328abort:
3329 return ret;
3330}
3331
3332static void
3333md_analyze_bracket(MD_CTX* ctx, int mark_index)
3334{
3335 /* We cannot really resolve links here as for that we would need
3336 * more context. E.g. a following pair of brackets (reference link),
3337 * or enclosing pair of brackets (if the inner is the link, the outer
3338 * one cannot be.)
3339 *
3340 * Therefore we here only construct a list of resolved '[' ']' pairs
3341 * ordered by position of the closer. This allows ur to analyze what is
3342 * or is not link in the right order, from inside to outside in case
3343 * of nested brackets.
3344 *
3345 * The resolving itself is deferred into md_resolve_links().
3346 */
3347
3348 MD_MARK* mark = &ctx->marks[mark_index];
3349
3350 if(mark->flags & MD_MARK_POTENTIAL_OPENER) {
3351 md_mark_chain_append(ctx, chain: &BRACKET_OPENERS, mark_index);
3352 return;
3353 }
3354
3355 if(BRACKET_OPENERS.tail >= 0) {
3356 /* Pop the opener from the chain. */
3357 int opener_index = BRACKET_OPENERS.tail;
3358 MD_MARK* opener = &ctx->marks[opener_index];
3359 if(opener->prev >= 0)
3360 ctx->marks[opener->prev].next = -1;
3361 else
3362 BRACKET_OPENERS.head = -1;
3363 BRACKET_OPENERS.tail = opener->prev;
3364
3365 /* Interconnect the opener and closer. */
3366 opener->next = mark_index;
3367 mark->prev = opener_index;
3368
3369 /* Add the pair into chain of potential links for md_resolve_links().
3370 * Note we misuse opener->prev for this as opener->next points to its
3371 * closer. */
3372 if(ctx->unresolved_link_tail >= 0)
3373 ctx->marks[ctx->unresolved_link_tail].prev = opener_index;
3374 else
3375 ctx->unresolved_link_head = opener_index;
3376 ctx->unresolved_link_tail = opener_index;
3377 opener->prev = -1;
3378 }
3379}
3380
3381/* Forward declaration. */
3382static void md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
3383 int mark_beg, int mark_end);
3384
3385static int
3386md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
3387{
3388 int opener_index = ctx->unresolved_link_head;
3389 OFF last_link_beg = 0;
3390 OFF last_link_end = 0;
3391 OFF last_img_beg = 0;
3392 OFF last_img_end = 0;
3393
3394 while(opener_index >= 0) {
3395 MD_MARK* opener = &ctx->marks[opener_index];
3396 int closer_index = opener->next;
3397 MD_MARK* closer = &ctx->marks[closer_index];
3398 int next_index = opener->prev;
3399 MD_MARK* next_opener;
3400 MD_MARK* next_closer;
3401 MD_LINK_ATTR attr;
3402 int is_link = FALSE;
3403
3404 if(next_index >= 0) {
3405 next_opener = &ctx->marks[next_index];
3406 next_closer = &ctx->marks[next_opener->next];
3407 } else {
3408 next_opener = NULL;
3409 next_closer = NULL;
3410 }
3411
3412 /* If nested ("[ [ ] ]"), we need to make sure that:
3413 * - The outer does not end inside of (...) belonging to the inner.
3414 * - The outer cannot be link if the inner is link (i.e. not image).
3415 *
3416 * (Note we here analyze from inner to outer as the marks are ordered
3417 * by closer->beg.)
3418 */
3419 if((opener->beg < last_link_beg && closer->end < last_link_end) ||
3420 (opener->beg < last_img_beg && closer->end < last_img_end) ||
3421 (opener->beg < last_link_end && opener->ch == '['))
3422 {
3423 opener_index = next_index;
3424 continue;
3425 }
3426
3427 /* Recognize and resolve wiki links.
3428 * Wiki-links maybe '[[destination]]' or '[[destination|label]]'.
3429 */
3430 if ((ctx->parser.flags & MD_FLAG_WIKILINKS) &&
3431 (opener->end - opener->beg == 1) && /* not image */
3432 next_opener != NULL && /* double '[' opener */
3433 next_opener->ch == '[' &&
3434 (next_opener->beg == opener->beg - 1) &&
3435 (next_opener->end - next_opener->beg == 1) &&
3436 next_closer != NULL && /* double ']' closer */
3437 next_closer->ch == ']' &&
3438 (next_closer->beg == closer->beg + 1) &&
3439 (next_closer->end - next_closer->beg == 1))
3440 {
3441 MD_MARK* delim = NULL;
3442 int delim_index;
3443 OFF dest_beg, dest_end;
3444
3445 is_link = TRUE;
3446
3447 /* We don't allow destination to be longer than 100 characters.
3448 * Lets scan to see whether there is '|'. (If not then the whole
3449 * wiki-link has to be below the 100 characters.) */
3450 delim_index = opener_index + 1;
3451 while(delim_index < closer_index) {
3452 MD_MARK* m = &ctx->marks[delim_index];
3453 if(m->ch == '|') {
3454 delim = m;
3455 break;
3456 }
3457 if(m->ch != 'D' && m->beg - opener->end > 100)
3458 break;
3459 delim_index++;
3460 }
3461 dest_beg = opener->end;
3462 dest_end = (delim != NULL) ? delim->beg : closer->beg;
3463 if(dest_end - dest_beg == 0 || dest_end - dest_beg > 100)
3464 is_link = FALSE;
3465
3466 /* There may not be any new line in the destination. */
3467 if(is_link) {
3468 OFF off;
3469 for(off = dest_beg; off < dest_end; off++) {
3470 if(ISNEWLINE(off)) {
3471 is_link = FALSE;
3472 break;
3473 }
3474 }
3475 }
3476
3477 if(is_link) {
3478 if(delim != NULL) {
3479 if(delim->end < closer->beg) {
3480 opener->end = delim->beg;
3481 } else {
3482 /* The pipe is just before the closer: [[foo|]] */
3483 closer->beg = delim->beg;
3484 delim = NULL;
3485 }
3486 }
3487
3488 opener->beg = next_opener->beg;
3489 opener->next = closer_index;
3490 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
3491
3492 closer->end = next_closer->end;
3493 closer->prev = opener_index;
3494 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
3495
3496 last_link_beg = opener->beg;
3497 last_link_end = closer->end;
3498
3499 if(delim != NULL) {
3500 delim->flags |= MD_MARK_RESOLVED;
3501 md_rollback(ctx, opener_index, closer_index: delim_index, MD_ROLLBACK_ALL);
3502 md_analyze_link_contents(ctx, lines, n_lines, mark_beg: opener_index+1, mark_end: closer_index);
3503 } else {
3504 md_rollback(ctx, opener_index, closer_index, MD_ROLLBACK_ALL);
3505 }
3506
3507 opener_index = next_opener->prev;
3508 continue;
3509 }
3510 }
3511
3512 if(next_opener != NULL && next_opener->beg == closer->end) {
3513 if(next_closer->beg > closer->end + 1) {
3514 /* Might be full reference link. */
3515 is_link = md_is_link_reference(ctx, lines, n_lines, beg: next_opener->beg, end: next_closer->end, attr: &attr);
3516 } else {
3517 /* Might be shortcut reference link. */
3518 is_link = md_is_link_reference(ctx, lines, n_lines, beg: opener->beg, end: closer->end, attr: &attr);
3519 }
3520
3521 if(is_link < 0)
3522 return -1;
3523
3524 if(is_link) {
3525 /* Eat the 2nd "[...]". */
3526 closer->end = next_closer->end;
3527 }
3528 } else {
3529 if(closer->end < ctx->size && CH(closer->end) == _T('(')) {
3530 /* Might be inline link. */
3531 OFF inline_link_end = UINT_MAX;
3532
3533 is_link = md_is_inline_link_spec(ctx, lines, n_lines, beg: closer->end, p_end: &inline_link_end, attr: &attr);
3534 if(is_link < 0)
3535 return -1;
3536
3537 /* Check the closing ')' is not inside an already resolved range
3538 * (i.e. a range with a higher priority), e.g. a code span. */
3539 if(is_link) {
3540 int i = closer_index + 1;
3541
3542 while(i < ctx->n_marks) {
3543 MD_MARK* mark = &ctx->marks[i];
3544
3545 if(mark->beg >= inline_link_end)
3546 break;
3547 if((mark->flags & (MD_MARK_OPENER | MD_MARK_RESOLVED)) == (MD_MARK_OPENER | MD_MARK_RESOLVED)) {
3548 if(ctx->marks[mark->next].beg >= inline_link_end) {
3549 /* Cancel the link status. */
3550 if(attr.title_needs_free)
3551 free(ptr: attr.title);
3552 is_link = FALSE;
3553 break;
3554 }
3555
3556 i = mark->next + 1;
3557 } else {
3558 i++;
3559 }
3560 }
3561 }
3562
3563 if(is_link) {
3564 /* Eat the "(...)" */
3565 closer->end = inline_link_end;
3566 }
3567 }
3568
3569 if(!is_link) {
3570 /* Might be collapsed reference link. */
3571 is_link = md_is_link_reference(ctx, lines, n_lines, beg: opener->beg, end: closer->end, attr: &attr);
3572 if(is_link < 0)
3573 return -1;
3574 }
3575 }
3576
3577 if(is_link) {
3578 /* Resolve the brackets as a link. */
3579 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
3580 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
3581
3582 /* If it is a link, we store the destination and title in the two
3583 * dummy marks after the opener. */
3584 MD_ASSERT(ctx->marks[opener_index+1].ch == 'D');
3585 ctx->marks[opener_index+1].beg = attr.dest_beg;
3586 ctx->marks[opener_index+1].end = attr.dest_end;
3587
3588 MD_ASSERT(ctx->marks[opener_index+2].ch == 'D');
3589 md_mark_store_ptr(ctx, mark_index: opener_index+2, ptr: attr.title);
3590 /* The title might or might not have been allocated for us. */
3591 if(attr.title_needs_free)
3592 md_mark_chain_append(ctx, chain: &PTR_CHAIN, mark_index: opener_index+2);
3593 ctx->marks[opener_index+2].prev = attr.title_size;
3594
3595 if(opener->ch == '[') {
3596 last_link_beg = opener->beg;
3597 last_link_end = closer->end;
3598 } else {
3599 last_img_beg = opener->beg;
3600 last_img_end = closer->end;
3601 }
3602
3603 md_analyze_link_contents(ctx, lines, n_lines, mark_beg: opener_index+1, mark_end: closer_index);
3604
3605 /* If the link text is formed by nothing but permissive autolink,
3606 * suppress the autolink.
3607 * See https://github.com/mity/md4c/issues/152 for more info. */
3608 if(ctx->parser.flags & MD_FLAG_PERMISSIVEAUTOLINKS) {
3609 MD_MARK* first_nested;
3610 MD_MARK* last_nested;
3611
3612 first_nested = opener + 1;
3613 while(first_nested->ch == _T('D') && first_nested < closer)
3614 first_nested++;
3615
3616 last_nested = closer - 1;
3617 while(first_nested->ch == _T('D') && last_nested > opener)
3618 last_nested--;
3619
3620 if((first_nested->flags & MD_MARK_RESOLVED) &&
3621 first_nested->beg == opener->end &&
3622 ISANYOF_(first_nested->ch, _T("@:.")) &&
3623 first_nested->next == (last_nested - ctx->marks) &&
3624 last_nested->end == closer->beg)
3625 {
3626 first_nested->ch = _T('D');
3627 first_nested->flags &= ~MD_MARK_RESOLVED;
3628 last_nested->ch = _T('D');
3629 last_nested->flags &= ~MD_MARK_RESOLVED;
3630 }
3631 }
3632 }
3633
3634 opener_index = next_index;
3635 }
3636
3637 return 0;
3638}
3639
3640/* Analyze whether the mark '&' starts a HTML entity.
3641 * If so, update its flags as well as flags of corresponding closer ';'. */
3642static void
3643md_analyze_entity(MD_CTX* ctx, int mark_index)
3644{
3645 MD_MARK* opener = &ctx->marks[mark_index];
3646 MD_MARK* closer;
3647 OFF off;
3648
3649 /* Cannot be entity if there is no closer as the next mark.
3650 * (Any other mark between would mean strange character which cannot be
3651 * part of the entity.
3652 *
3653 * So we can do all the work on '&' and do not call this later for the
3654 * closing mark ';'.
3655 */
3656 if(mark_index + 1 >= ctx->n_marks)
3657 return;
3658 closer = &ctx->marks[mark_index+1];
3659 if(closer->ch != ';')
3660 return;
3661
3662 if(md_is_entity(ctx, beg: opener->beg, max_end: closer->end, p_end: &off)) {
3663 MD_ASSERT(off == closer->end);
3664
3665 md_resolve_range(ctx, NULL, opener_index: mark_index, closer_index: mark_index+1);
3666 opener->end = closer->end;
3667 }
3668}
3669
3670static void
3671md_analyze_table_cell_boundary(MD_CTX* ctx, int mark_index)
3672{
3673 MD_MARK* mark = &ctx->marks[mark_index];
3674 mark->flags |= MD_MARK_RESOLVED;
3675
3676 md_mark_chain_append(ctx, chain: &TABLECELLBOUNDARIES, mark_index);
3677 ctx->n_table_cell_boundaries++;
3678}
3679
3680/* Split a longer mark into two. The new mark takes the given count of
3681 * characters. May only be called if an adequate number of dummy 'D' marks
3682 * follows.
3683 */
3684static int
3685md_split_emph_mark(MD_CTX* ctx, int mark_index, SZ n)
3686{
3687 MD_MARK* mark = &ctx->marks[mark_index];
3688 int new_mark_index = mark_index + (mark->end - mark->beg - n);
3689 MD_MARK* dummy = &ctx->marks[new_mark_index];
3690
3691 MD_ASSERT(mark->end - mark->beg > n);
3692 MD_ASSERT(dummy->ch == 'D');
3693
3694 memcpy(dest: dummy, src: mark, n: sizeof(MD_MARK));
3695 mark->end -= n;
3696 dummy->beg = mark->end;
3697
3698 return new_mark_index;
3699}
3700
3701static void
3702md_analyze_emph(MD_CTX* ctx, int mark_index)
3703{
3704 MD_MARK* mark = &ctx->marks[mark_index];
3705 MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
3706
3707 /* If we can be a closer, try to resolve with the preceding opener. */
3708 if(mark->flags & MD_MARK_POTENTIAL_CLOSER) {
3709 MD_MARK* opener = NULL;
3710 int opener_index;
3711
3712 if(mark->ch == _T('*')) {
3713 MD_MARKCHAIN* opener_chains[6];
3714 int i, n_opener_chains;
3715 unsigned flags = mark->flags;
3716
3717 /* Apply the "rule of three". */
3718 n_opener_chains = 0;
3719 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_0;
3720 if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3721 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_1;
3722 if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3723 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_2;
3724 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_0;
3725 if(!(flags & MD_MARK_EMPH_INTRAWORD) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3726 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_1;
3727 if(!(flags & MD_MARK_EMPH_INTRAWORD) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3728 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_2;
3729
3730 /* Opener is the most recent mark from the allowed chains. */
3731 for(i = 0; i < n_opener_chains; i++) {
3732 if(opener_chains[i]->tail >= 0) {
3733 int tmp_index = opener_chains[i]->tail;
3734 MD_MARK* tmp_mark = &ctx->marks[tmp_index];
3735 if(opener == NULL || tmp_mark->end > opener->end) {
3736 opener_index = tmp_index;
3737 opener = tmp_mark;
3738 }
3739 }
3740 }
3741 } else {
3742 /* Simple emph. mark */
3743 if(chain->tail >= 0) {
3744 opener_index = chain->tail;
3745 opener = &ctx->marks[opener_index];
3746 }
3747 }
3748
3749 /* Resolve, if we have found matching opener. */
3750 if(opener != NULL) {
3751 SZ opener_size = opener->end - opener->beg;
3752 SZ closer_size = mark->end - mark->beg;
3753 MD_MARKCHAIN* opener_chain = md_mark_chain(ctx, mark_index: opener_index);
3754
3755 if(opener_size > closer_size) {
3756 opener_index = md_split_emph_mark(ctx, mark_index: opener_index, n: closer_size);
3757 md_mark_chain_append(ctx, chain: opener_chain, mark_index: opener_index);
3758 } else if(opener_size < closer_size) {
3759 md_split_emph_mark(ctx, mark_index, n: closer_size - opener_size);
3760 }
3761
3762 md_rollback(ctx, opener_index, closer_index: mark_index, MD_ROLLBACK_CROSSING);
3763 md_resolve_range(ctx, chain: opener_chain, opener_index, closer_index: mark_index);
3764 return;
3765 }
3766 }
3767
3768 /* If we could not resolve as closer, we may be yet be an opener. */
3769 if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3770 md_mark_chain_append(ctx, chain, mark_index);
3771}
3772
3773static void
3774md_analyze_tilde(MD_CTX* ctx, int mark_index)
3775{
3776 MD_MARK* mark = &ctx->marks[mark_index];
3777 MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
3778
3779 /* We attempt to be Github Flavored Markdown compatible here. GFM accepts
3780 * only tildes sequences of length 1 and 2, and the length of the opener
3781 * and closer has to match. */
3782
3783 if((mark->flags & MD_MARK_POTENTIAL_CLOSER) && chain->head >= 0) {
3784 int opener_index = chain->head;
3785
3786 md_rollback(ctx, opener_index, closer_index: mark_index, MD_ROLLBACK_CROSSING);
3787 md_resolve_range(ctx, chain, opener_index, closer_index: mark_index);
3788 return;
3789 }
3790
3791 if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3792 md_mark_chain_append(ctx, chain, mark_index);
3793}
3794
3795static void
3796md_analyze_dollar(MD_CTX* ctx, int mark_index)
3797{
3798 /* This should mimic the way inline equations work in LaTeX, so there
3799 * can only ever be one item in the chain (i.e. the dollars can't be
3800 * nested). This is basically the same as the md_analyze_tilde function,
3801 * except that we require matching openers and closers to be of the same
3802 * length.
3803 *
3804 * E.g.: $abc$$def$$ => abc (display equation) def (end equation) */
3805 if(DOLLAR_OPENERS.head >= 0) {
3806 /* If the potential closer has a non-matching number of $, discard */
3807 MD_MARK* open = &ctx->marks[DOLLAR_OPENERS.head];
3808 MD_MARK* close = &ctx->marks[mark_index];
3809
3810 int opener_index = DOLLAR_OPENERS.head;
3811 md_rollback(ctx, opener_index, closer_index: mark_index, MD_ROLLBACK_ALL);
3812 if (open->end - open->beg == close->end - close->beg) {
3813 /* We are the matching closer */
3814 md_resolve_range(ctx, chain: &DOLLAR_OPENERS, opener_index, closer_index: mark_index);
3815 } else {
3816 /* We don't match the opener, so discard old opener and insert as opener */
3817 md_mark_chain_append(ctx, chain: &DOLLAR_OPENERS, mark_index);
3818 }
3819 } else {
3820 /* No unmatched openers, so we are opener */
3821 md_mark_chain_append(ctx, chain: &DOLLAR_OPENERS, mark_index);
3822 }
3823}
3824
3825static void
3826md_analyze_permissive_url_autolink(MD_CTX* ctx, int mark_index)
3827{
3828 MD_MARK* opener = &ctx->marks[mark_index];
3829 int closer_index = mark_index + 1;
3830 MD_MARK* closer = &ctx->marks[closer_index];
3831 MD_MARK* next_resolved_mark;
3832 OFF off = opener->end;
3833 int n_dots = FALSE;
3834 int has_underscore_in_last_seg = FALSE;
3835 int has_underscore_in_next_to_last_seg = FALSE;
3836 int n_opened_parenthesis = 0;
3837
3838 /* Check for domain. */
3839 while(off < ctx->size) {
3840 if(ISALNUM(off) || CH(off) == _T('-')) {
3841 off++;
3842 } else if(CH(off) == _T('.')) {
3843 /* We must see at least one period. */
3844 n_dots++;
3845 has_underscore_in_next_to_last_seg = has_underscore_in_last_seg;
3846 has_underscore_in_last_seg = FALSE;
3847 off++;
3848 } else if(CH(off) == _T('_')) {
3849 /* No underscore may be present in the last two domain segments. */
3850 has_underscore_in_last_seg = TRUE;
3851 off++;
3852 } else {
3853 break;
3854 }
3855 }
3856 if(off > opener->end && CH(off-1) == _T('.')) {
3857 off--;
3858 n_dots--;
3859 }
3860 if(off <= opener->end || n_dots == 0 || has_underscore_in_next_to_last_seg || has_underscore_in_last_seg)
3861 return;
3862
3863 /* Check for path. */
3864 next_resolved_mark = closer + 1;
3865 while(next_resolved_mark->ch == 'D' || !(next_resolved_mark->flags & MD_MARK_RESOLVED))
3866 next_resolved_mark++;
3867 while(off < next_resolved_mark->beg && CH(off) != _T('<') && !ISWHITESPACE(off) && !ISNEWLINE(off)) {
3868 /* Parenthesis must be balanced. */
3869 if(CH(off) == _T('(')) {
3870 n_opened_parenthesis++;
3871 } else if(CH(off) == _T(')')) {
3872 if(n_opened_parenthesis > 0)
3873 n_opened_parenthesis--;
3874 else
3875 break;
3876 }
3877
3878 off++;
3879 }
3880 /* These cannot be last char In such case they are more likely normal
3881 * punctuation. */
3882 if(ISANYOF(off-1, _T("?!.,:*_~")))
3883 off--;
3884
3885 /* Ok. Lets call it auto-link. Adapt opener and create closer to zero
3886 * length so all the contents becomes the link text. */
3887 MD_ASSERT(closer->ch == 'D');
3888 opener->end = opener->beg;
3889 closer->ch = opener->ch;
3890 closer->beg = off;
3891 closer->end = off;
3892 md_resolve_range(ctx, NULL, opener_index: mark_index, closer_index);
3893}
3894
3895/* The permissive autolinks do not have to be enclosed in '<' '>' but we
3896 * instead impose stricter rules what is understood as an e-mail address
3897 * here. Actually any non-alphanumeric characters with exception of '.'
3898 * are prohibited both in username and after '@'. */
3899static void
3900md_analyze_permissive_email_autolink(MD_CTX* ctx, int mark_index)
3901{
3902 MD_MARK* opener = &ctx->marks[mark_index];
3903 int closer_index;
3904 MD_MARK* closer;
3905 OFF beg = opener->beg;
3906 OFF end = opener->end;
3907 int dot_count = 0;
3908
3909 MD_ASSERT(CH(beg) == _T('@'));
3910
3911 /* Scan for name before '@'. */
3912 while(beg > 0 && (ISALNUM(beg-1) || ISANYOF(beg-1, _T(".-_+"))))
3913 beg--;
3914
3915 /* Scan for domain after '@'. */
3916 while(end < ctx->size && (ISALNUM(end) || ISANYOF(end, _T(".-_")))) {
3917 if(CH(end) == _T('.'))
3918 dot_count++;
3919 end++;
3920 }
3921 if(CH(end-1) == _T('.')) { /* Final '.' not part of it. */
3922 dot_count--;
3923 end--;
3924 }
3925 else if(ISANYOF2(end-1, _T('-'), _T('_'))) /* These are forbidden at the end. */
3926 return;
3927 if(CH(end-1) == _T('@') || dot_count == 0)
3928 return;
3929
3930 /* Ok. Lets call it auto-link. Adapt opener and create closer to zero
3931 * length so all the contents becomes the link text. */
3932 closer_index = mark_index + 1;
3933 closer = &ctx->marks[closer_index];
3934 MD_ASSERT(closer->ch == 'D');
3935
3936 opener->beg = beg;
3937 opener->end = beg;
3938 closer->ch = opener->ch;
3939 closer->beg = end;
3940 closer->end = end;
3941 md_resolve_range(ctx, NULL, opener_index: mark_index, closer_index);
3942}
3943
3944static inline void
3945md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
3946 int mark_beg, int mark_end, const CHAR* mark_chars)
3947{
3948 int i = mark_beg;
3949
3950 while(i < mark_end) {
3951 MD_MARK* mark = &ctx->marks[i];
3952
3953 /* Skip resolved spans. */
3954 if(mark->flags & MD_MARK_RESOLVED) {
3955 if(mark->flags & MD_MARK_OPENER) {
3956 MD_ASSERT(i < mark->next);
3957 i = mark->next + 1;
3958 } else {
3959 i++;
3960 }
3961 continue;
3962 }
3963
3964 /* Skip marks we do not want to deal with. */
3965 if(!ISANYOF_(mark->ch, mark_chars)) {
3966 i++;
3967 continue;
3968 }
3969
3970 /* Analyze the mark. */
3971 switch(mark->ch) {
3972 case '[': /* Pass through. */
3973 case '!': /* Pass through. */
3974 case ']': md_analyze_bracket(ctx, mark_index: i); break;
3975 case '&': md_analyze_entity(ctx, mark_index: i); break;
3976 case '|': md_analyze_table_cell_boundary(ctx, mark_index: i); break;
3977 case '_': /* Pass through. */
3978 case '*': md_analyze_emph(ctx, mark_index: i); break;
3979 case '~': md_analyze_tilde(ctx, mark_index: i); break;
3980 case '$': md_analyze_dollar(ctx, mark_index: i); break;
3981 case '.': /* Pass through. */
3982 case ':': md_analyze_permissive_url_autolink(ctx, mark_index: i); break;
3983 case '@': md_analyze_permissive_email_autolink(ctx, mark_index: i); break;
3984 }
3985
3986 i++;
3987 }
3988}
3989
3990/* Analyze marks (build ctx->marks). */
3991static int
3992md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
3993{
3994 int ret;
3995
3996 /* Reset the previously collected stack of marks. */
3997 ctx->n_marks = 0;
3998
3999 /* Collect all marks. */
4000 MD_CHECK(md_collect_marks(ctx, lines, n_lines, table_mode));
4001
4002 /* We analyze marks in few groups to handle their precedence. */
4003 /* (1) Entities; code spans; autolinks; raw HTML. */
4004 md_analyze_marks(ctx, lines, n_lines, mark_beg: 0, mark_end: ctx->n_marks, _T("&"));
4005
4006 /* (2) Links. */
4007 md_analyze_marks(ctx, lines, n_lines, mark_beg: 0, mark_end: ctx->n_marks, _T("[]!"));
4008 MD_CHECK(md_resolve_links(ctx, lines, n_lines));
4009 BRACKET_OPENERS.head = -1;
4010 BRACKET_OPENERS.tail = -1;
4011 ctx->unresolved_link_head = -1;
4012 ctx->unresolved_link_tail = -1;
4013
4014 if(table_mode) {
4015 /* (3) Analyze table cell boundaries.
4016 * Note we reset TABLECELLBOUNDARIES chain prior to the call md_analyze_marks(),
4017 * not after, because caller may need it. */
4018 MD_ASSERT(n_lines == 1);
4019 TABLECELLBOUNDARIES.head = -1;
4020 TABLECELLBOUNDARIES.tail = -1;
4021 ctx->n_table_cell_boundaries = 0;
4022 md_analyze_marks(ctx, lines, n_lines, mark_beg: 0, mark_end: ctx->n_marks, _T("|"));
4023 return ret;
4024 }
4025
4026 /* (4) Emphasis and strong emphasis; permissive autolinks. */
4027 md_analyze_link_contents(ctx, lines, n_lines, mark_beg: 0, mark_end: ctx->n_marks);
4028
4029abort:
4030 return ret;
4031}
4032
4033static void
4034md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
4035 int mark_beg, int mark_end)
4036{
4037 int i;
4038
4039 md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("*_~$@:."));
4040
4041 for(i = OPENERS_CHAIN_FIRST; i <= OPENERS_CHAIN_LAST; i++) {
4042 ctx->mark_chains[i].head = -1;
4043 ctx->mark_chains[i].tail = -1;
4044 }
4045}
4046
4047static int
4048md_enter_leave_span_a(MD_CTX* ctx, int enter, MD_SPANTYPE type,
4049 const CHAR* dest, SZ dest_size, int prohibit_escapes_in_dest,
4050 const CHAR* title, SZ title_size)
4051{
4052 MD_ATTRIBUTE_BUILD href_build = { 0 };
4053 MD_ATTRIBUTE_BUILD title_build = { 0 };
4054 MD_SPAN_A_DETAIL det;
4055 int ret = 0;
4056
4057 /* Note we here rely on fact that MD_SPAN_A_DETAIL and
4058 * MD_SPAN_IMG_DETAIL are binary-compatible. */
4059 memset(s: &det, c: 0, n: sizeof(MD_SPAN_A_DETAIL));
4060 MD_CHECK(md_build_attribute(ctx, dest, dest_size,
4061 (prohibit_escapes_in_dest ? MD_BUILD_ATTR_NO_ESCAPES : 0),
4062 &det.href, &href_build));
4063 MD_CHECK(md_build_attribute(ctx, title, title_size, 0, &det.title, &title_build));
4064
4065 if(enter)
4066 MD_ENTER_SPAN(type, &det);
4067 else
4068 MD_LEAVE_SPAN(type, &det);
4069
4070abort:
4071 md_free_attribute(ctx, build: &href_build);
4072 md_free_attribute(ctx, build: &title_build);
4073 return ret;
4074}
4075
4076static int
4077md_enter_leave_span_wikilink(MD_CTX* ctx, int enter, const CHAR* target, SZ target_size)
4078{
4079 MD_ATTRIBUTE_BUILD target_build = { 0 };
4080 MD_SPAN_WIKILINK_DETAIL det;
4081 int ret = 0;
4082
4083 memset(s: &det, c: 0, n: sizeof(MD_SPAN_WIKILINK_DETAIL));
4084 MD_CHECK(md_build_attribute(ctx, target, target_size, 0, &det.target, &target_build));
4085
4086 if (enter)
4087 MD_ENTER_SPAN(MD_SPAN_WIKILINK, &det);
4088 else
4089 MD_LEAVE_SPAN(MD_SPAN_WIKILINK, &det);
4090
4091abort:
4092 md_free_attribute(ctx, build: &target_build);
4093 return ret;
4094}
4095
4096
4097/* Render the output, accordingly to the analyzed ctx->marks. */
4098static int
4099md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
4100{
4101 MD_TEXTTYPE text_type;
4102 const MD_LINE* line = lines;
4103 MD_MARK* prev_mark = NULL;
4104 MD_MARK* mark;
4105 OFF off = lines[0].beg;
4106 OFF end = lines[n_lines-1].end;
4107 int enforce_hardbreak = 0;
4108 int ret = 0;
4109
4110 /* Find first resolved mark. Note there is always at least one resolved
4111 * mark, the dummy last one after the end of the latest line we actually
4112 * never really reach. This saves us of a lot of special checks and cases
4113 * in this function. */
4114 mark = ctx->marks;
4115 while(!(mark->flags & MD_MARK_RESOLVED))
4116 mark++;
4117
4118 text_type = MD_TEXT_NORMAL;
4119
4120 while(1) {
4121 /* Process the text up to the next mark or end-of-line. */
4122 OFF tmp = (line->end < mark->beg ? line->end : mark->beg);
4123 if(tmp > off) {
4124 MD_TEXT(text_type, STR(off), tmp - off);
4125 off = tmp;
4126 }
4127
4128 /* If reached the mark, process it and move to next one. */
4129 if(off >= mark->beg) {
4130 switch(mark->ch) {
4131 case '\\': /* Backslash escape. */
4132 if(ISNEWLINE(mark->beg+1))
4133 enforce_hardbreak = 1;
4134 else
4135 MD_TEXT(text_type, STR(mark->beg+1), 1);
4136 break;
4137
4138 case ' ': /* Non-trivial space. */
4139 MD_TEXT(text_type, _T(" "), 1);
4140 break;
4141
4142 case '`': /* Code span. */
4143 if(mark->flags & MD_MARK_OPENER) {
4144 MD_ENTER_SPAN(MD_SPAN_CODE, NULL);
4145 text_type = MD_TEXT_CODE;
4146 } else {
4147 MD_LEAVE_SPAN(MD_SPAN_CODE, NULL);
4148 text_type = MD_TEXT_NORMAL;
4149 }
4150 break;
4151
4152 case '_': /* Underline (or emphasis if we fall through). */
4153 if(ctx->parser.flags & MD_FLAG_UNDERLINE) {
4154 if(mark->flags & MD_MARK_OPENER) {
4155 while(off < mark->end) {
4156 MD_ENTER_SPAN(MD_SPAN_U, NULL);
4157 off++;
4158 }
4159 } else {
4160 while(off < mark->end) {
4161 MD_LEAVE_SPAN(MD_SPAN_U, NULL);
4162 off++;
4163 }
4164 }
4165 break;
4166 }
4167 /* Fall though. */
4168
4169 case '*': /* Emphasis, strong emphasis. */
4170 if(mark->flags & MD_MARK_OPENER) {
4171 if((mark->end - off) % 2) {
4172 MD_ENTER_SPAN(MD_SPAN_EM, NULL);
4173 off++;
4174 }
4175 while(off + 1 < mark->end) {
4176 MD_ENTER_SPAN(MD_SPAN_STRONG, NULL);
4177 off += 2;
4178 }
4179 } else {
4180 while(off + 1 < mark->end) {
4181 MD_LEAVE_SPAN(MD_SPAN_STRONG, NULL);
4182 off += 2;
4183 }
4184 if((mark->end - off) % 2) {
4185 MD_LEAVE_SPAN(MD_SPAN_EM, NULL);
4186 off++;
4187 }
4188 }
4189 break;
4190
4191 case '~':
4192 if(mark->flags & MD_MARK_OPENER)
4193 MD_ENTER_SPAN(MD_SPAN_DEL, NULL);
4194 else
4195 MD_LEAVE_SPAN(MD_SPAN_DEL, NULL);
4196 break;
4197
4198 case '$':
4199 if(mark->flags & MD_MARK_OPENER) {
4200 MD_ENTER_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
4201 text_type = MD_TEXT_LATEXMATH;
4202 } else {
4203 MD_LEAVE_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
4204 text_type = MD_TEXT_NORMAL;
4205 }
4206 break;
4207
4208 case '[': /* Link, wiki link, image. */
4209 case '!':
4210 case ']':
4211 {
4212 const MD_MARK* opener = (mark->ch != ']' ? mark : &ctx->marks[mark->prev]);
4213 const MD_MARK* closer = &ctx->marks[opener->next];
4214 const MD_MARK* dest_mark;
4215 const MD_MARK* title_mark;
4216
4217 if ((opener->ch == '[' && closer->ch == ']') &&
4218 opener->end - opener->beg >= 2 &&
4219 closer->end - closer->beg >= 2)
4220 {
4221 int has_label = (opener->end - opener->beg > 2);
4222 SZ target_sz;
4223
4224 if(has_label)
4225 target_sz = opener->end - (opener->beg+2);
4226 else
4227 target_sz = closer->beg - opener->end;
4228
4229 MD_CHECK(md_enter_leave_span_wikilink(ctx, (mark->ch != ']'),
4230 has_label ? STR(opener->beg+2) : STR(opener->end),
4231 target_sz));
4232
4233 break;
4234 }
4235
4236 dest_mark = opener+1;
4237 MD_ASSERT(dest_mark->ch == 'D');
4238 title_mark = opener+2;
4239 MD_ASSERT(title_mark->ch == 'D');
4240
4241 MD_CHECK(md_enter_leave_span_a(ctx, (mark->ch != ']'),
4242 (opener->ch == '!' ? MD_SPAN_IMG : MD_SPAN_A),
4243 STR(dest_mark->beg), dest_mark->end - dest_mark->beg, FALSE,
4244 md_mark_get_ptr(ctx, title_mark - ctx->marks), title_mark->prev));
4245
4246 /* link/image closer may span multiple lines. */
4247 if(mark->ch == ']') {
4248 while(mark->end > line->end)
4249 line++;
4250 }
4251
4252 break;
4253 }
4254
4255 case '<':
4256 case '>': /* Autolink or raw HTML. */
4257 if(!(mark->flags & MD_MARK_AUTOLINK)) {
4258 /* Raw HTML. */
4259 if(mark->flags & MD_MARK_OPENER)
4260 text_type = MD_TEXT_HTML;
4261 else
4262 text_type = MD_TEXT_NORMAL;
4263 break;
4264 }
4265 /* Pass through, if auto-link. */
4266
4267 case '@': /* Permissive e-mail autolink. */
4268 case ':': /* Permissive URL autolink. */
4269 case '.': /* Permissive WWW autolink. */
4270 {
4271 MD_MARK* opener = ((mark->flags & MD_MARK_OPENER) ? mark : &ctx->marks[mark->prev]);
4272 MD_MARK* closer = &ctx->marks[opener->next];
4273 const CHAR* dest = STR(opener->end);
4274 SZ dest_size = closer->beg - opener->end;
4275
4276 /* For permissive auto-links we do not know closer mark
4277 * position at the time of md_collect_marks(), therefore
4278 * it can be out-of-order in ctx->marks[].
4279 *
4280 * With this flag, we make sure that we output the closer
4281 * only if we processed the opener. */
4282 if(mark->flags & MD_MARK_OPENER)
4283 closer->flags |= MD_MARK_VALIDPERMISSIVEAUTOLINK;
4284
4285 if(opener->ch == '@' || opener->ch == '.') {
4286 dest_size += 7;
4287 MD_TEMP_BUFFER(dest_size * sizeof(CHAR));
4288 memcpy(dest: ctx->buffer,
4289 src: (opener->ch == '@' ? _T("mailto:") : _T("http://")),
4290 n: 7 * sizeof(CHAR));
4291 memcpy(dest: ctx->buffer + 7, src: dest, n: (dest_size-7) * sizeof(CHAR));
4292 dest = ctx->buffer;
4293 }
4294
4295 if(closer->flags & MD_MARK_VALIDPERMISSIVEAUTOLINK)
4296 MD_CHECK(md_enter_leave_span_a(ctx, (mark->flags & MD_MARK_OPENER),
4297 MD_SPAN_A, dest, dest_size, TRUE, NULL, 0));
4298 break;
4299 }
4300
4301 case '&': /* Entity. */
4302 MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg);
4303 break;
4304
4305 case '\0':
4306 MD_TEXT(MD_TEXT_NULLCHAR, _T(""), 1);
4307 break;
4308
4309 case 127:
4310 goto abort;
4311 }
4312
4313 off = mark->end;
4314
4315 /* Move to next resolved mark. */
4316 prev_mark = mark;
4317 mark++;
4318 while(!(mark->flags & MD_MARK_RESOLVED) || mark->beg < off)
4319 mark++;
4320 }
4321
4322 /* If reached end of line, move to next one. */
4323 if(off >= line->end) {
4324 /* If it is the last line, we are done. */
4325 if(off >= end)
4326 break;
4327
4328 if(text_type == MD_TEXT_CODE || text_type == MD_TEXT_LATEXMATH) {
4329 OFF tmp;
4330
4331 MD_ASSERT(prev_mark != NULL);
4332 MD_ASSERT(ISANYOF2_(prev_mark->ch, '`', '$') && (prev_mark->flags & MD_MARK_OPENER));
4333 MD_ASSERT(ISANYOF2_(mark->ch, '`', '$') && (mark->flags & MD_MARK_CLOSER));
4334
4335 /* Inside a code span, trailing line whitespace has to be
4336 * outputted. */
4337 tmp = off;
4338 while(off < ctx->size && ISBLANK(off))
4339 off++;
4340 if(off > tmp)
4341 MD_TEXT(text_type, STR(tmp), off-tmp);
4342
4343 /* and new lines are transformed into single spaces. */
4344 if(prev_mark->end < off && off < mark->beg)
4345 MD_TEXT(text_type, _T(" "), 1);
4346 } else if(text_type == MD_TEXT_HTML) {
4347 /* Inside raw HTML, we output the new line verbatim, including
4348 * any trailing spaces. */
4349 OFF tmp = off;
4350
4351 while(tmp < end && ISBLANK(tmp))
4352 tmp++;
4353 if(tmp > off)
4354 MD_TEXT(MD_TEXT_HTML, STR(off), tmp - off);
4355 MD_TEXT(MD_TEXT_HTML, _T("\n"), 1);
4356 } else {
4357 /* Output soft or hard line break. */
4358 MD_TEXTTYPE break_type = MD_TEXT_SOFTBR;
4359
4360 if(text_type == MD_TEXT_NORMAL) {
4361 if(enforce_hardbreak)
4362 break_type = MD_TEXT_BR;
4363 else if((CH(line->end) == _T(' ') && CH(line->end+1) == _T(' ')))
4364 break_type = MD_TEXT_BR;
4365 }
4366
4367 MD_TEXT(break_type, _T("\n"), 1);
4368 }
4369
4370 /* Move to the next line. */
4371 line++;
4372 off = line->beg;
4373
4374 enforce_hardbreak = 0;
4375 }
4376 }
4377
4378abort:
4379 return ret;
4380}
4381
4382
4383/***************************
4384 *** Processing Tables ***
4385 ***************************/
4386
4387static void
4388md_analyze_table_alignment(MD_CTX* ctx, OFF beg, OFF end, MD_ALIGN* align, int n_align)
4389{
4390 static const MD_ALIGN align_map[] = { MD_ALIGN_DEFAULT, MD_ALIGN_LEFT, MD_ALIGN_RIGHT, MD_ALIGN_CENTER };
4391 OFF off = beg;
4392
4393 while(n_align > 0) {
4394 int index = 0; /* index into align_map[] */
4395
4396 while(CH(off) != _T('-'))
4397 off++;
4398 if(off > beg && CH(off-1) == _T(':'))
4399 index |= 1;
4400 while(off < end && CH(off) == _T('-'))
4401 off++;
4402 if(off < end && CH(off) == _T(':'))
4403 index |= 2;
4404
4405 *align = align_map[index];
4406 align++;
4407 n_align--;
4408 }
4409
4410}
4411
4412/* Forward declaration. */
4413static int md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines);
4414
4415static int
4416md_process_table_cell(MD_CTX* ctx, MD_BLOCKTYPE cell_type, MD_ALIGN align, OFF beg, OFF end)
4417{
4418 MD_LINE line;
4419 MD_BLOCK_TD_DETAIL det;
4420 int ret = 0;
4421
4422 while(beg < end && ISWHITESPACE(beg))
4423 beg++;
4424 while(end > beg && ISWHITESPACE(end-1))
4425 end--;
4426
4427 det.align = align;
4428 line.beg = beg;
4429 line.end = end;
4430
4431 MD_ENTER_BLOCK(cell_type, &det);
4432 MD_CHECK(md_process_normal_block_contents(ctx, &line, 1));
4433 MD_LEAVE_BLOCK(cell_type, &det);
4434
4435abort:
4436 return ret;
4437}
4438
4439static int
4440md_process_table_row(MD_CTX* ctx, MD_BLOCKTYPE cell_type, OFF beg, OFF end,
4441 const MD_ALIGN* align, int col_count)
4442{
4443 MD_LINE line;
4444 OFF* pipe_offs = NULL;
4445 int i, j, k, n;
4446 int ret = 0;
4447
4448 line.beg = beg;
4449 line.end = end;
4450
4451 /* Break the line into table cells by identifying pipe characters who
4452 * form the cell boundary. */
4453 MD_CHECK(md_analyze_inlines(ctx, &line, 1, TRUE));
4454
4455 /* We have to remember the cell boundaries in local buffer because
4456 * ctx->marks[] shall be reused during cell contents processing. */
4457 n = ctx->n_table_cell_boundaries + 2;
4458 pipe_offs = (OFF*) malloc(size: n * sizeof(OFF));
4459 if(pipe_offs == NULL) {
4460 MD_LOG("malloc() failed.");
4461 ret = -1;
4462 goto abort;
4463 }
4464 j = 0;
4465 pipe_offs[j++] = beg;
4466 for(i = TABLECELLBOUNDARIES.head; i >= 0; i = ctx->marks[i].next) {
4467 MD_MARK* mark = &ctx->marks[i];
4468 pipe_offs[j++] = mark->end;
4469 }
4470 pipe_offs[j++] = end+1;
4471
4472 /* Process cells. */
4473 MD_ENTER_BLOCK(MD_BLOCK_TR, NULL);
4474 k = 0;
4475 for(i = 0; i < j-1 && k < col_count; i++) {
4476 if(pipe_offs[i] < pipe_offs[i+1]-1)
4477 MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], pipe_offs[i], pipe_offs[i+1]-1));
4478 }
4479 /* Make sure we call enough table cells even if the current table contains
4480 * too few of them. */
4481 while(k < col_count)
4482 MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], 0, 0));
4483 MD_LEAVE_BLOCK(MD_BLOCK_TR, NULL);
4484
4485abort:
4486 free(ptr: pipe_offs);
4487
4488 /* Free any temporary memory blocks stored within some dummy marks. */
4489 for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
4490 free(ptr: md_mark_get_ptr(ctx, mark_index: i));
4491 PTR_CHAIN.head = -1;
4492 PTR_CHAIN.tail = -1;
4493
4494 return ret;
4495}
4496
4497static int
4498md_process_table_block_contents(MD_CTX* ctx, int col_count, const MD_LINE* lines, int n_lines)
4499{
4500 MD_ALIGN* align;
4501 int i;
4502 int ret = 0;
4503
4504 /* At least two lines have to be present: The column headers and the line
4505 * with the underlines. */
4506 MD_ASSERT(n_lines >= 2);
4507
4508 align = malloc(size: col_count * sizeof(MD_ALIGN));
4509 if(align == NULL) {
4510 MD_LOG("malloc() failed.");
4511 ret = -1;
4512 goto abort;
4513 }
4514
4515 md_analyze_table_alignment(ctx, beg: lines[1].beg, end: lines[1].end, align, n_align: col_count);
4516
4517 MD_ENTER_BLOCK(MD_BLOCK_THEAD, NULL);
4518 MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TH,
4519 lines[0].beg, lines[0].end, align, col_count));
4520 MD_LEAVE_BLOCK(MD_BLOCK_THEAD, NULL);
4521
4522 MD_ENTER_BLOCK(MD_BLOCK_TBODY, NULL);
4523 for(i = 2; i < n_lines; i++) {
4524 MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TD,
4525 lines[i].beg, lines[i].end, align, col_count));
4526 }
4527 MD_LEAVE_BLOCK(MD_BLOCK_TBODY, NULL);
4528
4529abort:
4530 free(ptr: align);
4531 return ret;
4532}
4533
4534
4535/**************************
4536 *** Processing Block ***
4537 **************************/
4538
/* Flags tested against the flags member of the block records. */

/* The record opens a container block: md_process_all_blocks() reports
 * entering of the block for it. */
#define MD_BLOCK_CONTAINER_OPENER   0x01
/* The record closes a container block: md_process_all_blocks() reports
 * leaving of the block for it. */
#define MD_BLOCK_CONTAINER_CLOSER   0x02
/* Mask: the record describes a container (opener and/or closer) rather than
 * a leaf block. */
#define MD_BLOCK_CONTAINER          (MD_BLOCK_CONTAINER_OPENER | MD_BLOCK_CONTAINER_CLOSER)
/* The list is loose, i.e. it is reported with is_tight set to FALSE. */
#define MD_BLOCK_LOOSE_LIST         0x04
/* The MD_BLOCK_H block is a setext header: md_end_current_block() removes
 * its last line (the underline) when the block is ended. */
#define MD_BLOCK_SETEXT_HEADER      0x08
4544
/* Record describing one block, as stored in the ctx->block_bytes buffer
 * (the code in this file accesses the records through the MD_BLOCK type;
 * the typedef is not part of this section).
 *
 * A record of a leaf block is directly followed by n_lines line records:
 * MD_VERBATIMLINE for MD_BLOCK_CODE and MD_BLOCK_HTML, MD_LINE otherwise.
 */
struct MD_BLOCK_tag {
    MD_BLOCKTYPE type  :  8;
    /* Combination of the MD_BLOCK_xxxx flags defined above. */
    unsigned flags     :  8;

    /* MD_BLOCK_H:      Header level (1 - 6)
     * MD_BLOCK_CODE:   Non-zero if fenced, zero if indented.
     * MD_BLOCK_LI:     Task mark character (0 if not task list item, 'x', 'X' or ' ').
     * MD_BLOCK_TABLE:  Column count (as determined by the table underline).
     * MD_BLOCK_UL:     Reported as the list item mark character.
     * MD_BLOCK_OL:     Reported as the list item mark delimiter character.
     */
    unsigned data      : 16;

    /* Leaf blocks:     Count of lines (MD_LINE or MD_VERBATIMLINE) on the block.
     * MD_BLOCK_LI:     Task mark offset in the input doc.
     * MD_BLOCK_OL:     Start item number.
     */
    unsigned n_lines;
};
4562
/* Record describing one container (an element of ctx->containers[]).
 *
 * NOTE(review): apart from is_loose, the members are not used in this
 * section of the file; the notes on them are inferred from their names
 * only -- confirm against the line analysis code.
 */
struct MD_CONTAINER_tag {
    CHAR ch;                    /* presumably the character of the container mark -- confirm */
    /* Non-zero for a loose list. md_process_all_blocks() also sets it for
     * block quotes so that paragraphs inside them are always reported. */
    unsigned is_loose    : 8;
    unsigned is_task     : 8;   /* presumably non-zero for a task list item -- confirm */
    unsigned start;             /* presumably the start number of an ordered list -- confirm */
    unsigned mark_indent;       /* presumably the indentation of the container mark -- confirm */
    unsigned contents_indent;   /* presumably the indentation of the container contents -- confirm */
    OFF block_byte_off;         /* presumably an offset into ctx->block_bytes -- confirm */
    OFF task_mark_off;          /* presumably the document offset of the task mark -- confirm */
};
4573
4574
4575static int
4576md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
4577{
4578 int i;
4579 int ret;
4580
4581 MD_CHECK(md_analyze_inlines(ctx, lines, n_lines, FALSE));
4582 MD_CHECK(md_process_inlines(ctx, lines, n_lines));
4583
4584abort:
4585 /* Free any temporary memory blocks stored within some dummy marks. */
4586 for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
4587 free(ptr: md_mark_get_ptr(ctx, mark_index: i));
4588 PTR_CHAIN.head = -1;
4589 PTR_CHAIN.tail = -1;
4590
4591 return ret;
4592}
4593
4594static int
4595md_process_verbatim_block_contents(MD_CTX* ctx, MD_TEXTTYPE text_type, const MD_VERBATIMLINE* lines, int n_lines)
4596{
4597 static const CHAR indent_chunk_str[] = _T(" ");
4598 static const SZ indent_chunk_size = SIZEOF_ARRAY(indent_chunk_str) - 1;
4599
4600 int i;
4601 int ret = 0;
4602
4603 for(i = 0; i < n_lines; i++) {
4604 const MD_VERBATIMLINE* line = &lines[i];
4605 int indent = line->indent;
4606
4607 MD_ASSERT(indent >= 0);
4608
4609 /* Output code indentation. */
4610 while(indent > (int) indent_chunk_size) {
4611 MD_TEXT(text_type, indent_chunk_str, indent_chunk_size);
4612 indent -= indent_chunk_size;
4613 }
4614 if(indent > 0)
4615 MD_TEXT(text_type, indent_chunk_str, indent);
4616
4617 /* Output the code line itself. */
4618 MD_TEXT_INSECURE(text_type, STR(line->beg), line->end - line->beg);
4619
4620 /* Enforce end-of-line. */
4621 MD_TEXT(text_type, _T("\n"), 1);
4622 }
4623
4624abort:
4625 return ret;
4626}
4627
4628static int
4629md_process_code_block_contents(MD_CTX* ctx, int is_fenced, const MD_VERBATIMLINE* lines, int n_lines)
4630{
4631 if(is_fenced) {
4632 /* Skip the first line in case of fenced code: It is the fence.
4633 * (Only the starting fence is present due to logic in md_analyze_line().) */
4634 lines++;
4635 n_lines--;
4636 } else {
4637 /* Ignore blank lines at start/end of indented code block. */
4638 while(n_lines > 0 && lines[0].beg == lines[0].end) {
4639 lines++;
4640 n_lines--;
4641 }
4642 while(n_lines > 0 && lines[n_lines-1].beg == lines[n_lines-1].end) {
4643 n_lines--;
4644 }
4645 }
4646
4647 if(n_lines == 0)
4648 return 0;
4649
4650 return md_process_verbatim_block_contents(ctx, text_type: MD_TEXT_CODE, lines, n_lines);
4651}
4652
4653static int
4654md_setup_fenced_code_detail(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_CODE_DETAIL* det,
4655 MD_ATTRIBUTE_BUILD* info_build, MD_ATTRIBUTE_BUILD* lang_build)
4656{
4657 const MD_VERBATIMLINE* fence_line = (const MD_VERBATIMLINE*)(block + 1);
4658 OFF beg = fence_line->beg;
4659 OFF end = fence_line->end;
4660 OFF lang_end;
4661 CHAR fence_ch = CH(fence_line->beg);
4662 int ret = 0;
4663
4664 /* Skip the fence itself. */
4665 while(beg < ctx->size && CH(beg) == fence_ch)
4666 beg++;
4667 /* Trim initial spaces. */
4668 while(beg < ctx->size && CH(beg) == _T(' '))
4669 beg++;
4670
4671 /* Trim trailing spaces. */
4672 while(end > beg && CH(end-1) == _T(' '))
4673 end--;
4674
4675 /* Build info string attribute. */
4676 MD_CHECK(md_build_attribute(ctx, STR(beg), end - beg, 0, &det->info, info_build));
4677
4678 /* Build info string attribute. */
4679 lang_end = beg;
4680 while(lang_end < end && !ISWHITESPACE(lang_end))
4681 lang_end++;
4682 MD_CHECK(md_build_attribute(ctx, STR(beg), lang_end - beg, 0, &det->lang, lang_build));
4683
4684 det->fence_char = fence_ch;
4685
4686abort:
4687 return ret;
4688}
4689
4690static int
4691md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block)
4692{
4693 union {
4694 MD_BLOCK_H_DETAIL header;
4695 MD_BLOCK_CODE_DETAIL code;
4696 } det;
4697 MD_ATTRIBUTE_BUILD info_build;
4698 MD_ATTRIBUTE_BUILD lang_build;
4699 int is_in_tight_list;
4700 int clean_fence_code_detail = FALSE;
4701 int ret = 0;
4702
4703 memset(s: &det, c: 0, n: sizeof(det));
4704
4705 if(ctx->n_containers == 0)
4706 is_in_tight_list = FALSE;
4707 else
4708 is_in_tight_list = !ctx->containers[ctx->n_containers-1].is_loose;
4709
4710 switch(block->type) {
4711 case MD_BLOCK_H:
4712 det.header.level = block->data;
4713 break;
4714
4715 case MD_BLOCK_CODE:
4716 /* For fenced code block, we may need to set the info string. */
4717 if(block->data != 0) {
4718 memset(s: &det.code, c: 0, n: sizeof(MD_BLOCK_CODE_DETAIL));
4719 clean_fence_code_detail = TRUE;
4720 MD_CHECK(md_setup_fenced_code_detail(ctx, block, &det.code, &info_build, &lang_build));
4721 }
4722 break;
4723
4724 default:
4725 /* Noop. */
4726 break;
4727 }
4728
4729 if(!is_in_tight_list || block->type != MD_BLOCK_P)
4730 MD_ENTER_BLOCK(block->type, (void*) &det);
4731
4732 /* Process the block contents accordingly to is type. */
4733 switch(block->type) {
4734 case MD_BLOCK_HR:
4735 /* noop */
4736 break;
4737
4738 case MD_BLOCK_CODE:
4739 MD_CHECK(md_process_code_block_contents(ctx, (block->data != 0),
4740 (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4741 break;
4742
4743 case MD_BLOCK_HTML:
4744 MD_CHECK(md_process_verbatim_block_contents(ctx, MD_TEXT_HTML,
4745 (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4746 break;
4747
4748 case MD_BLOCK_TABLE:
4749 MD_CHECK(md_process_table_block_contents(ctx, block->data,
4750 (const MD_LINE*)(block + 1), block->n_lines));
4751 break;
4752
4753 default:
4754 MD_CHECK(md_process_normal_block_contents(ctx,
4755 (const MD_LINE*)(block + 1), block->n_lines));
4756 break;
4757 }
4758
4759 if(!is_in_tight_list || block->type != MD_BLOCK_P)
4760 MD_LEAVE_BLOCK(block->type, (void*) &det);
4761
4762abort:
4763 if(clean_fence_code_detail) {
4764 md_free_attribute(ctx, build: &info_build);
4765 md_free_attribute(ctx, build: &lang_build);
4766 }
4767 return ret;
4768}
4769
4770static int
4771md_process_all_blocks(MD_CTX* ctx)
4772{
4773 int byte_off = 0;
4774 int ret = 0;
4775
4776 /* ctx->containers now is not needed for detection of lists and list items
4777 * so we reuse it for tracking what lists are loose or tight. We rely
4778 * on the fact the vector is large enough to hold the deepest nesting
4779 * level of lists. */
4780 ctx->n_containers = 0;
4781
4782 while(byte_off < ctx->n_block_bytes) {
4783 MD_BLOCK* block = (MD_BLOCK*)((char*)ctx->block_bytes + byte_off);
4784 union {
4785 MD_BLOCK_UL_DETAIL ul;
4786 MD_BLOCK_OL_DETAIL ol;
4787 MD_BLOCK_LI_DETAIL li;
4788 } det;
4789
4790 switch(block->type) {
4791 case MD_BLOCK_UL:
4792 det.ul.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
4793 det.ul.mark = (CHAR) block->data;
4794 break;
4795
4796 case MD_BLOCK_OL:
4797 det.ol.start = block->n_lines;
4798 det.ol.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
4799 det.ol.mark_delimiter = (CHAR) block->data;
4800 break;
4801
4802 case MD_BLOCK_LI:
4803 det.li.is_task = (block->data != 0);
4804 det.li.task_mark = (CHAR) block->data;
4805 det.li.task_mark_offset = (OFF) block->n_lines;
4806 break;
4807
4808 default:
4809 /* noop */
4810 break;
4811 }
4812
4813 if(block->flags & MD_BLOCK_CONTAINER) {
4814 if(block->flags & MD_BLOCK_CONTAINER_CLOSER) {
4815 MD_LEAVE_BLOCK(block->type, &det);
4816
4817 if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL || block->type == MD_BLOCK_QUOTE)
4818 ctx->n_containers--;
4819 }
4820
4821 if(block->flags & MD_BLOCK_CONTAINER_OPENER) {
4822 MD_ENTER_BLOCK(block->type, &det);
4823
4824 if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL) {
4825 ctx->containers[ctx->n_containers].is_loose = (block->flags & MD_BLOCK_LOOSE_LIST);
4826 ctx->n_containers++;
4827 } else if(block->type == MD_BLOCK_QUOTE) {
4828 /* This causes that any text in a block quote, even if
4829 * nested inside a tight list item, is wrapped with
4830 * <p>...</p>. */
4831 ctx->containers[ctx->n_containers].is_loose = TRUE;
4832 ctx->n_containers++;
4833 }
4834 }
4835 } else {
4836 MD_CHECK(md_process_leaf_block(ctx, block));
4837
4838 if(block->type == MD_BLOCK_CODE || block->type == MD_BLOCK_HTML)
4839 byte_off += block->n_lines * sizeof(MD_VERBATIMLINE);
4840 else
4841 byte_off += block->n_lines * sizeof(MD_LINE);
4842 }
4843
4844 byte_off += sizeof(MD_BLOCK);
4845 }
4846
4847 ctx->n_block_bytes = 0;
4848
4849abort:
4850 return ret;
4851}
4852
4853
4854/************************************
4855 *** Grouping Lines into Blocks ***
4856 ************************************/
4857
4858static void*
4859md_push_block_bytes(MD_CTX* ctx, int n_bytes)
4860{
4861 void* ptr;
4862
4863 if(ctx->n_block_bytes + n_bytes > ctx->alloc_block_bytes) {
4864 void* new_block_bytes;
4865
4866 ctx->alloc_block_bytes = (ctx->alloc_block_bytes > 0
4867 ? ctx->alloc_block_bytes + ctx->alloc_block_bytes / 2
4868 : 512);
4869 new_block_bytes = realloc(ptr: ctx->block_bytes, size: ctx->alloc_block_bytes);
4870 if(new_block_bytes == NULL) {
4871 MD_LOG("realloc() failed.");
4872 return NULL;
4873 }
4874
4875 /* Fix the ->current_block after the reallocation. */
4876 if(ctx->current_block != NULL) {
4877 OFF off_current_block = (char*) ctx->current_block - (char*) ctx->block_bytes;
4878 ctx->current_block = (MD_BLOCK*) ((char*) new_block_bytes + off_current_block);
4879 }
4880
4881 ctx->block_bytes = new_block_bytes;
4882 }
4883
4884 ptr = (char*)ctx->block_bytes + ctx->n_block_bytes;
4885 ctx->n_block_bytes += n_bytes;
4886 return ptr;
4887}
4888
4889static int
4890md_start_new_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* line)
4891{
4892 MD_BLOCK* block;
4893
4894 MD_ASSERT(ctx->current_block == NULL);
4895
4896 block = (MD_BLOCK*) md_push_block_bytes(ctx, n_bytes: sizeof(MD_BLOCK));
4897 if(block == NULL)
4898 return -1;
4899
4900 switch(line->type) {
4901 case MD_LINE_HR:
4902 block->type = MD_BLOCK_HR;
4903 break;
4904
4905 case MD_LINE_ATXHEADER:
4906 case MD_LINE_SETEXTHEADER:
4907 block->type = MD_BLOCK_H;
4908 break;
4909
4910 case MD_LINE_FENCEDCODE:
4911 case MD_LINE_INDENTEDCODE:
4912 block->type = MD_BLOCK_CODE;
4913 break;
4914
4915 case MD_LINE_TEXT:
4916 block->type = MD_BLOCK_P;
4917 break;
4918
4919 case MD_LINE_HTML:
4920 block->type = MD_BLOCK_HTML;
4921 break;
4922
4923 case MD_LINE_BLANK:
4924 case MD_LINE_SETEXTUNDERLINE:
4925 case MD_LINE_TABLEUNDERLINE:
4926 default:
4927 MD_UNREACHABLE();
4928 break;
4929 }
4930
4931 block->flags = 0;
4932 block->data = line->data;
4933 block->n_lines = 0;
4934
4935 ctx->current_block = block;
4936 return 0;
4937}
4938
4939/* Eat from start of current (textual) block any reference definitions and
4940 * remember them so we can resolve any links referring to them.
4941 *
4942 * (Reference definitions can only be at start of it as they cannot break
4943 * a paragraph.)
4944 */
4945static int
4946md_consume_link_reference_definitions(MD_CTX* ctx)
4947{
4948 MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
4949 int n_lines = ctx->current_block->n_lines;
4950 int n = 0;
4951
4952 /* Compute how many lines at the start of the block form one or more
4953 * reference definitions. */
4954 while(n < n_lines) {
4955 int n_link_ref_lines;
4956
4957 n_link_ref_lines = md_is_link_reference_definition(ctx,
4958 lines: lines + n, n_lines: n_lines - n);
4959 /* Not a reference definition? */
4960 if(n_link_ref_lines == 0)
4961 break;
4962
4963 /* We fail if it is the ref. def. but it could not be stored due
4964 * a memory allocation error. */
4965 if(n_link_ref_lines < 0)
4966 return -1;
4967
4968 n += n_link_ref_lines;
4969 }
4970
4971 /* If there was at least one reference definition, we need to remove
4972 * its lines from the block, or perhaps even the whole block. */
4973 if(n > 0) {
4974 if(n == n_lines) {
4975 /* Remove complete block. */
4976 ctx->n_block_bytes -= n * sizeof(MD_LINE);
4977 ctx->n_block_bytes -= sizeof(MD_BLOCK);
4978 ctx->current_block = NULL;
4979 } else {
4980 /* Remove just some initial lines from the block. */
4981 memmove(dest: lines, src: lines + n, n: (n_lines - n) * sizeof(MD_LINE));
4982 ctx->current_block->n_lines -= n;
4983 ctx->n_block_bytes -= n * sizeof(MD_LINE);
4984 }
4985 }
4986
4987 return 0;
4988}
4989
4990static int
4991md_end_current_block(MD_CTX* ctx)
4992{
4993 int ret = 0;
4994
4995 if(ctx->current_block == NULL)
4996 return ret;
4997
4998 /* Check whether there is a reference definition. (We do this here instead
4999 * of in md_analyze_line() because reference definition can take multiple
5000 * lines.) */
5001 if(ctx->current_block->type == MD_BLOCK_P ||
5002 (ctx->current_block->type == MD_BLOCK_H && (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)))
5003 {
5004 MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
5005 if(CH(lines[0].beg) == _T('[')) {
5006 MD_CHECK(md_consume_link_reference_definitions(ctx));
5007 if(ctx->current_block == NULL)
5008 return ret;
5009 }
5010 }
5011
5012 if(ctx->current_block->type == MD_BLOCK_H && (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)) {
5013 int n_lines = ctx->current_block->n_lines;
5014
5015 if(n_lines > 1) {
5016 /* Get rid of the underline. */
5017 ctx->current_block->n_lines--;
5018 ctx->n_block_bytes -= sizeof(MD_LINE);
5019 } else {
5020 /* Only the underline has left after eating the ref. defs.
5021 * Keep the line as beginning of a new ordinary paragraph. */
5022 ctx->current_block->type = MD_BLOCK_P;
5023 return 0;
5024 }
5025 }
5026
5027 /* Mark we are not building any block anymore. */
5028 ctx->current_block = NULL;
5029
5030abort:
5031 return ret;
5032}
5033
5034static int
5035md_add_line_into_current_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* analysis)
5036{
5037 MD_ASSERT(ctx->current_block != NULL);
5038
5039 if(ctx->current_block->type == MD_BLOCK_CODE || ctx->current_block->type == MD_BLOCK_HTML) {
5040 MD_VERBATIMLINE* line;
5041
5042 line = (MD_VERBATIMLINE*) md_push_block_bytes(ctx, n_bytes: sizeof(MD_VERBATIMLINE));
5043 if(line == NULL)
5044 return -1;
5045
5046 line->indent = analysis->indent;
5047 line->beg = analysis->beg;
5048 line->end = analysis->end;
5049 } else {
5050 MD_LINE* line;
5051
5052 line = (MD_LINE*) md_push_block_bytes(ctx, n_bytes: sizeof(MD_LINE));
5053 if(line == NULL)
5054 return -1;
5055
5056 line->beg = analysis->beg;
5057 line->end = analysis->end;
5058 }
5059 ctx->current_block->n_lines++;
5060
5061 return 0;
5062}
5063
5064static int
5065md_push_container_bytes(MD_CTX* ctx, MD_BLOCKTYPE type, unsigned start,
5066 unsigned data, unsigned flags)
5067{
5068 MD_BLOCK* block;
5069 int ret = 0;
5070
5071 MD_CHECK(md_end_current_block(ctx));
5072
5073 block = (MD_BLOCK*) md_push_block_bytes(ctx, n_bytes: sizeof(MD_BLOCK));
5074 if(block == NULL)
5075 return -1;
5076
5077 block->type = type;
5078 block->flags = flags;
5079 block->data = data;
5080 block->n_lines = start;
5081
5082abort:
5083 return ret;
5084}
5085
5086
5087
5088/***********************
5089 *** Line Analysis ***
5090 ***********************/
5091
5092static int
5093md_is_hr_line(MD_CTX* ctx, OFF beg, OFF* p_end, OFF* p_killer)
5094{
5095 OFF off = beg + 1;
5096 int n = 1;
5097
5098 while(off < ctx->size && (CH(off) == CH(beg) || CH(off) == _T(' ') || CH(off) == _T('\t'))) {
5099 if(CH(off) == CH(beg))
5100 n++;
5101 off++;
5102 }
5103
5104 if(n < 3) {
5105 *p_killer = off;
5106 return FALSE;
5107 }
5108
5109 /* Nothing else can be present on the line. */
5110 if(off < ctx->size && !ISNEWLINE(off)) {
5111 *p_killer = off;
5112 return FALSE;
5113 }
5114
5115 *p_end = off;
5116 return TRUE;
5117}
5118
5119static int
5120md_is_atxheader_line(MD_CTX* ctx, OFF beg, OFF* p_beg, OFF* p_end, unsigned* p_level)
5121{
5122 int n;
5123 OFF off = beg + 1;
5124
5125 while(off < ctx->size && CH(off) == _T('#') && off - beg < 7)
5126 off++;
5127 n = off - beg;
5128
5129 if(n > 6)
5130 return FALSE;
5131 *p_level = n;
5132
5133 if(!(ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS) && off < ctx->size &&
5134 CH(off) != _T(' ') && CH(off) != _T('\t') && !ISNEWLINE(off))
5135 return FALSE;
5136
5137 while(off < ctx->size && CH(off) == _T(' '))
5138 off++;
5139 *p_beg = off;
5140 *p_end = off;
5141 return TRUE;
5142}
5143
5144static int
5145md_is_setext_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_level)
5146{
5147 OFF off = beg + 1;
5148
5149 while(off < ctx->size && CH(off) == CH(beg))
5150 off++;
5151
5152 /* Optionally, space(s) can follow. */
5153 while(off < ctx->size && CH(off) == _T(' '))
5154 off++;
5155
5156 /* But nothing more is allowed on the line. */
5157 if(off < ctx->size && !ISNEWLINE(off))
5158 return FALSE;
5159
5160 *p_level = (CH(beg) == _T('=') ? 1 : 2);
5161 *p_end = off;
5162 return TRUE;
5163}
5164
5165static int
5166md_is_table_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_col_count)
5167{
5168 OFF off = beg;
5169 int found_pipe = FALSE;
5170 unsigned col_count = 0;
5171
5172 if(off < ctx->size && CH(off) == _T('|')) {
5173 found_pipe = TRUE;
5174 off++;
5175 while(off < ctx->size && ISWHITESPACE(off))
5176 off++;
5177 }
5178
5179 while(1) {
5180 OFF cell_beg;
5181 int delimited = FALSE;
5182
5183 /* Cell underline ("-----", ":----", "----:" or ":----:") */
5184 cell_beg = off;
5185 if(off < ctx->size && CH(off) == _T(':'))
5186 off++;
5187 while(off < ctx->size && CH(off) == _T('-'))
5188 off++;
5189 if(off < ctx->size && CH(off) == _T(':'))
5190 off++;
5191 if(off - cell_beg < 3)
5192 return FALSE;
5193
5194 col_count++;
5195
5196 /* Pipe delimiter (optional at the end of line). */
5197 while(off < ctx->size && ISWHITESPACE(off))
5198 off++;
5199 if(off < ctx->size && CH(off) == _T('|')) {
5200 delimited = TRUE;
5201 found_pipe = TRUE;
5202 off++;
5203 while(off < ctx->size && ISWHITESPACE(off))
5204 off++;
5205 }
5206
5207 /* Success, if we reach end of line. */
5208 if(off >= ctx->size || ISNEWLINE(off))
5209 break;
5210
5211 if(!delimited)
5212 return FALSE;
5213 }
5214
5215 if(!found_pipe)
5216 return FALSE;
5217
5218 *p_end = off;
5219 *p_col_count = col_count;
5220 return TRUE;
5221}
5222
5223static int
5224md_is_opening_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end)
5225{
5226 OFF off = beg;
5227
5228 while(off < ctx->size && CH(off) == CH(beg))
5229 off++;
5230
5231 /* Fence must have at least three characters. */
5232 if(off - beg < 3)
5233 return FALSE;
5234
5235 ctx->code_fence_length = off - beg;
5236
5237 /* Optionally, space(s) can follow. */
5238 while(off < ctx->size && CH(off) == _T(' '))
5239 off++;
5240
5241 /* Optionally, an info string can follow. */
5242 while(off < ctx->size && !ISNEWLINE(off)) {
5243 /* Backtick-based fence must not contain '`' in the info string. */
5244 if(CH(beg) == _T('`') && CH(off) == _T('`'))
5245 return FALSE;
5246 off++;
5247 }
5248
5249 *p_end = off;
5250 return TRUE;
5251}
5252
5253static int
5254md_is_closing_code_fence(MD_CTX* ctx, CHAR ch, OFF beg, OFF* p_end)
5255{
5256 OFF off = beg;
5257 int ret = FALSE;
5258
5259 /* Closing fence must have at least the same length and use same char as
5260 * opening one. */
5261 while(off < ctx->size && CH(off) == ch)
5262 off++;
5263 if(off - beg < ctx->code_fence_length)
5264 goto out;
5265
5266 /* Optionally, space(s) can follow */
5267 while(off < ctx->size && CH(off) == _T(' '))
5268 off++;
5269
5270 /* But nothing more is allowed on the line. */
5271 if(off < ctx->size && !ISNEWLINE(off))
5272 goto out;
5273
5274 ret = TRUE;
5275
5276out:
5277 /* Note we set *p_end even on failure: If we are not closing fence, caller
5278 * would eat the line anyway without any parsing. */
5279 *p_end = off;
5280 return ret;
5281}
5282
5283/* Returns type of the raw HTML block, or FALSE if it is not HTML block.
5284 * (Refer to CommonMark specification for details about the types.)
5285 */
5286static int
5287md_is_html_block_start_condition(MD_CTX* ctx, OFF beg)
5288{
5289 typedef struct TAG_tag TAG;
5290 struct TAG_tag {
5291 const CHAR* name;
5292 unsigned len : 8;
5293 };
5294
5295 /* Type 6 is started by a long list of allowed tags. We use two-level
5296 * tree to speed-up the search. */
5297#ifdef X
5298 #undef X
5299#endif
5300#define X(name) { _T(name), (sizeof(name)-1) / sizeof(CHAR) }
5301#define Xend { NULL, 0 }
5302 static const TAG t1[] = { X("script"), X("pre"), X("style"), Xend };
5303
5304 static const TAG a6[] = { X("address"), X("article"), X("aside"), Xend };
5305 static const TAG b6[] = { X("base"), X("basefont"), X("blockquote"), X("body"), Xend };
5306 static const TAG c6[] = { X("caption"), X("center"), X("col"), X("colgroup"), Xend };
5307 static const TAG d6[] = { X("dd"), X("details"), X("dialog"), X("dir"),
5308 X("div"), X("dl"), X("dt"), Xend };
5309 static const TAG f6[] = { X("fieldset"), X("figcaption"), X("figure"), X("footer"),
5310 X("form"), X("frame"), X("frameset"), Xend };
5311 static const TAG h6[] = { X("h1"), X("head"), X("header"), X("hr"), X("html"), Xend };
5312 static const TAG i6[] = { X("iframe"), Xend };
5313 static const TAG l6[] = { X("legend"), X("li"), X("link"), Xend };
5314 static const TAG m6[] = { X("main"), X("menu"), X("menuitem"), Xend };
5315 static const TAG n6[] = { X("nav"), X("noframes"), Xend };
5316 static const TAG o6[] = { X("ol"), X("optgroup"), X("option"), Xend };
5317 static const TAG p6[] = { X("p"), X("param"), Xend };
5318 static const TAG s6[] = { X("section"), X("source"), X("summary"), Xend };
5319 static const TAG t6[] = { X("table"), X("tbody"), X("td"), X("tfoot"), X("th"),
5320 X("thead"), X("title"), X("tr"), X("track"), Xend };
5321 static const TAG u6[] = { X("ul"), Xend };
5322 static const TAG xx[] = { Xend };
5323#undef X
5324
5325 static const TAG* map6[26] = {
5326 a6, b6, c6, d6, xx, f6, xx, h6, i6, xx, xx, l6, m6,
5327 n6, o6, p6, xx, xx, s6, t6, u6, xx, xx, xx, xx, xx
5328 };
5329 OFF off = beg + 1;
5330 int i;
5331
5332 /* Check for type 1: <script, <pre, or <style */
5333 for(i = 0; t1[i].name != NULL; i++) {
5334 if(off + t1[i].len <= ctx->size) {
5335 if(md_ascii_case_eq(STR(off), s2: t1[i].name, n: t1[i].len))
5336 return 1;
5337 }
5338 }
5339
5340 /* Check for type 2: <!-- */
5341 if(off + 3 < ctx->size && CH(off) == _T('!') && CH(off+1) == _T('-') && CH(off+2) == _T('-'))
5342 return 2;
5343
5344 /* Check for type 3: <? */
5345 if(off < ctx->size && CH(off) == _T('?'))
5346 return 3;
5347
5348 /* Check for type 4 or 5: <! */
5349 if(off < ctx->size && CH(off) == _T('!')) {
5350 /* Check for type 4: <! followed by uppercase letter. */
5351 if(off + 1 < ctx->size && ISUPPER(off+1))
5352 return 4;
5353
5354 /* Check for type 5: <![CDATA[ */
5355 if(off + 8 < ctx->size) {
5356 if(md_ascii_eq(STR(off), _T("![CDATA["), n: 8))
5357 return 5;
5358 }
5359 }
5360
5361 /* Check for type 6: Many possible starting tags listed above. */
5362 if(off + 1 < ctx->size && (ISALPHA(off) || (CH(off) == _T('/') && ISALPHA(off+1)))) {
5363 int slot;
5364 const TAG* tags;
5365
5366 if(CH(off) == _T('/'))
5367 off++;
5368
5369 slot = (ISUPPER(off) ? CH(off) - 'A' : CH(off) - 'a');
5370 tags = map6[slot];
5371
5372 for(i = 0; tags[i].name != NULL; i++) {
5373 if(off + tags[i].len <= ctx->size) {
5374 if(md_ascii_case_eq(STR(off), s2: tags[i].name, n: tags[i].len)) {
5375 OFF tmp = off + tags[i].len;
5376 if(tmp >= ctx->size)
5377 return 6;
5378 if(ISBLANK(tmp) || ISNEWLINE(tmp) || CH(tmp) == _T('>'))
5379 return 6;
5380 if(tmp+1 < ctx->size && CH(tmp) == _T('/') && CH(tmp+1) == _T('>'))
5381 return 6;
5382 break;
5383 }
5384 }
5385 }
5386 }
5387
5388 /* Check for type 7: any COMPLETE other opening or closing tag. */
5389 if(off + 1 < ctx->size) {
5390 OFF end;
5391
5392 if(md_is_html_tag(ctx, NULL, n_lines: 0, beg, max_end: ctx->size, p_end: &end)) {
5393 /* Only optional whitespace and new line may follow. */
5394 while(end < ctx->size && ISWHITESPACE(end))
5395 end++;
5396 if(end >= ctx->size || ISNEWLINE(end))
5397 return 7;
5398 }
5399 }
5400
5401 return FALSE;
5402}
5403
5404/* Case sensitive check whether there is a substring 'what' between 'beg'
5405 * and end of line. */
5406static int
5407md_line_contains(MD_CTX* ctx, OFF beg, const CHAR* what, SZ what_len, OFF* p_end)
5408{
5409 OFF i;
5410 for(i = beg; i + what_len < ctx->size; i++) {
5411 if(ISNEWLINE(i))
5412 break;
5413 if(memcmp(STR(i), s2: what, n: what_len * sizeof(CHAR)) == 0) {
5414 *p_end = i + what_len;
5415 return TRUE;
5416 }
5417 }
5418
5419 *p_end = i;
5420 return FALSE;
5421}
5422
5423/* Returns type of HTML block end condition or FALSE if not an end condition.
5424 *
5425 * Note it fills p_end even when it is not end condition as the caller
5426 * does not need to analyze contents of a raw HTML block.
5427 */
5428static int
5429md_is_html_block_end_condition(MD_CTX* ctx, OFF beg, OFF* p_end)
5430{
5431 switch(ctx->html_block_type) {
5432 case 1:
5433 {
5434 OFF off = beg;
5435
5436 while(off < ctx->size && !ISNEWLINE(off)) {
5437 if(CH(off) == _T('<')) {
5438 if(md_ascii_case_eq(STR(off), _T("</script>"), n: 9)) {
5439 *p_end = off + 9;
5440 return TRUE;
5441 }
5442
5443 if(md_ascii_case_eq(STR(off), _T("</style>"), n: 8)) {
5444 *p_end = off + 8;
5445 return TRUE;
5446 }
5447
5448 if(md_ascii_case_eq(STR(off), _T("</pre>"), n: 6)) {
5449 *p_end = off + 6;
5450 return TRUE;
5451 }
5452 }
5453
5454 off++;
5455 }
5456 *p_end = off;
5457 return FALSE;
5458 }
5459
5460 case 2:
5461 return (md_line_contains(ctx, beg, _T("-->"), what_len: 3, p_end) ? 2 : FALSE);
5462
5463 case 3:
5464 return (md_line_contains(ctx, beg, _T("?>"), what_len: 2, p_end) ? 3 : FALSE);
5465
5466 case 4:
5467 return (md_line_contains(ctx, beg, _T(">"), what_len: 1, p_end) ? 4 : FALSE);
5468
5469 case 5:
5470 return (md_line_contains(ctx, beg, _T("]]>"), what_len: 3, p_end) ? 5 : FALSE);
5471
5472 case 6: /* Pass through */
5473 case 7:
5474 *p_end = beg;
5475 return (ISNEWLINE(beg) ? ctx->html_block_type : FALSE);
5476
5477 default:
5478 MD_UNREACHABLE();
5479 }
5480 return FALSE;
5481}
5482
5483
5484static int
5485md_is_container_compatible(const MD_CONTAINER* pivot, const MD_CONTAINER* container)
5486{
5487 /* Block quote has no "items" like lists. */
5488 if(container->ch == _T('>'))
5489 return FALSE;
5490
5491 if(container->ch != pivot->ch)
5492 return FALSE;
5493 if(container->mark_indent > pivot->contents_indent)
5494 return FALSE;
5495
5496 return TRUE;
5497}
5498
5499static int
5500md_push_container(MD_CTX* ctx, const MD_CONTAINER* container)
5501{
5502 if(ctx->n_containers >= ctx->alloc_containers) {
5503 MD_CONTAINER* new_containers;
5504
5505 ctx->alloc_containers = (ctx->alloc_containers > 0
5506 ? ctx->alloc_containers + ctx->alloc_containers / 2
5507 : 16);
5508 new_containers = realloc(ptr: ctx->containers, size: ctx->alloc_containers * sizeof(MD_CONTAINER));
5509 if(new_containers == NULL) {
5510 MD_LOG("realloc() failed.");
5511 return -1;
5512 }
5513
5514 ctx->containers = new_containers;
5515 }
5516
5517 memcpy(dest: &ctx->containers[ctx->n_containers++], src: container, n: sizeof(MD_CONTAINER));
5518 return 0;
5519}
5520
5521static int
5522md_enter_child_containers(MD_CTX* ctx, int n_children)
5523{
5524 int i;
5525 int ret = 0;
5526
5527 for(i = ctx->n_containers - n_children; i < ctx->n_containers; i++) {
5528 MD_CONTAINER* c = &ctx->containers[i];
5529 int is_ordered_list = FALSE;
5530
5531 switch(c->ch) {
5532 case _T(')'):
5533 case _T('.'):
5534 is_ordered_list = TRUE;
5535 /* Pass through */
5536
5537 case _T('-'):
5538 case _T('+'):
5539 case _T('*'):
5540 /* Remember offset in ctx->block_bytes so we can revisit the
5541 * block if we detect it is a loose list. */
5542 md_end_current_block(ctx);
5543 c->block_byte_off = ctx->n_block_bytes;
5544
5545 MD_CHECK(md_push_container_bytes(ctx,
5546 (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL),
5547 c->start, c->ch, MD_BLOCK_CONTAINER_OPENER));
5548 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5549 c->task_mark_off,
5550 (c->is_task ? CH(c->task_mark_off) : 0),
5551 MD_BLOCK_CONTAINER_OPENER));
5552 break;
5553
5554 case _T('>'):
5555 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0, 0, MD_BLOCK_CONTAINER_OPENER));
5556 break;
5557
5558 default:
5559 MD_UNREACHABLE();
5560 break;
5561 }
5562 }
5563
5564abort:
5565 return ret;
5566}
5567
5568static int
5569md_leave_child_containers(MD_CTX* ctx, int n_keep)
5570{
5571 int ret = 0;
5572
5573 while(ctx->n_containers > n_keep) {
5574 MD_CONTAINER* c = &ctx->containers[ctx->n_containers-1];
5575 int is_ordered_list = FALSE;
5576
5577 switch(c->ch) {
5578 case _T(')'):
5579 case _T('.'):
5580 is_ordered_list = TRUE;
5581 /* Pass through */
5582
5583 case _T('-'):
5584 case _T('+'):
5585 case _T('*'):
5586 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5587 c->task_mark_off, (c->is_task ? CH(c->task_mark_off) : 0),
5588 MD_BLOCK_CONTAINER_CLOSER));
5589 MD_CHECK(md_push_container_bytes(ctx,
5590 (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL), 0,
5591 c->ch, MD_BLOCK_CONTAINER_CLOSER));
5592 break;
5593
5594 case _T('>'):
5595 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0,
5596 0, MD_BLOCK_CONTAINER_CLOSER));
5597 break;
5598
5599 default:
5600 MD_UNREACHABLE();
5601 break;
5602 }
5603
5604 ctx->n_containers--;
5605 }
5606
5607abort:
5608 return ret;
5609}
5610
5611static int
5612md_is_container_mark(MD_CTX* ctx, unsigned indent, OFF beg, OFF* p_end, MD_CONTAINER* p_container)
5613{
5614 OFF off = beg;
5615 OFF max_end;
5616
5617 if(indent >= ctx->code_indent_offset)
5618 return FALSE;
5619
5620 /* Check for block quote mark. */
5621 if(off < ctx->size && CH(off) == _T('>')) {
5622 off++;
5623 p_container->ch = _T('>');
5624 p_container->is_loose = FALSE;
5625 p_container->is_task = FALSE;
5626 p_container->mark_indent = indent;
5627 p_container->contents_indent = indent + 1;
5628 *p_end = off;
5629 return TRUE;
5630 }
5631
5632 /* Check for list item bullet mark. */
5633 if(off+1 < ctx->size && ISANYOF(off, _T("-+*")) && (ISBLANK(off+1) || ISNEWLINE(off+1))) {
5634 p_container->ch = CH(off);
5635 p_container->is_loose = FALSE;
5636 p_container->is_task = FALSE;
5637 p_container->mark_indent = indent;
5638 p_container->contents_indent = indent + 1;
5639 *p_end = off + 1;
5640 return TRUE;
5641 }
5642
5643 /* Check for ordered list item marks. */
5644 max_end = off + 9;
5645 if(max_end > ctx->size)
5646 max_end = ctx->size;
5647 p_container->start = 0;
5648 while(off < max_end && ISDIGIT(off)) {
5649 p_container->start = p_container->start * 10 + CH(off) - _T('0');
5650 off++;
5651 }
5652 if(off > beg && off+1 < ctx->size &&
5653 (CH(off) == _T('.') || CH(off) == _T(')')) &&
5654 (ISBLANK(off+1) || ISNEWLINE(off+1)))
5655 {
5656 p_container->ch = CH(off);
5657 p_container->is_loose = FALSE;
5658 p_container->is_task = FALSE;
5659 p_container->mark_indent = indent;
5660 p_container->contents_indent = indent + off - beg + 1;
5661 *p_end = off + 1;
5662 return TRUE;
5663 }
5664
5665 return FALSE;
5666}
5667
5668static unsigned
5669md_line_indentation(MD_CTX* ctx, unsigned total_indent, OFF beg, OFF* p_end)
5670{
5671 OFF off = beg;
5672 unsigned indent = total_indent;
5673
5674 while(off < ctx->size && ISBLANK(off)) {
5675 if(CH(off) == _T('\t'))
5676 indent = (indent + 4) & ~3;
5677 else
5678 indent++;
5679 off++;
5680 }
5681
5682 *p_end = off;
5683 return indent - total_indent;
5684}
5685
/* Shared constant line analysis of the MD_LINE_BLANK type. md_analyze_line()
 * substitutes it for its pivot line after a list item mark is recognized, so
 * that the rest of the line is analyzed as if a blank line preceded it. */
static const MD_LINE_ANALYSIS md_dummy_blank_line = { MD_LINE_BLANK, 0 };
5687
5688/* Analyze type of the line and find some its properties. This serves as a
5689 * main input for determining type and boundaries of a block. */
5690static int
5691md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end,
5692 const MD_LINE_ANALYSIS* pivot_line, MD_LINE_ANALYSIS* line)
5693{
5694 unsigned total_indent = 0;
5695 int n_parents = 0;
5696 int n_brothers = 0;
5697 int n_children = 0;
5698 MD_CONTAINER container = { 0 };
5699 int prev_line_has_list_loosening_effect = ctx->last_line_has_list_loosening_effect;
5700 OFF off = beg;
5701 OFF hr_killer = 0;
5702 int ret = 0;
5703
5704 line->indent = md_line_indentation(ctx, total_indent, beg: off, p_end: &off);
5705 total_indent += line->indent;
5706 line->beg = off;
5707
5708 /* Given the indentation and block quote marks '>', determine how many of
5709 * the current containers are our parents. */
5710 while(n_parents < ctx->n_containers) {
5711 MD_CONTAINER* c = &ctx->containers[n_parents];
5712
5713 if(c->ch == _T('>') && line->indent < ctx->code_indent_offset &&
5714 off < ctx->size && CH(off) == _T('>'))
5715 {
5716 /* Block quote mark. */
5717 off++;
5718 total_indent++;
5719 line->indent = md_line_indentation(ctx, total_indent, beg: off, p_end: &off);
5720 total_indent += line->indent;
5721
5722 /* The optional 1st space after '>' is part of the block quote mark. */
5723 if(line->indent > 0)
5724 line->indent--;
5725
5726 line->beg = off;
5727
5728 } else if(c->ch != _T('>') && line->indent >= c->contents_indent) {
5729 /* List. */
5730 line->indent -= c->contents_indent;
5731 } else {
5732 break;
5733 }
5734
5735 n_parents++;
5736 }
5737
5738 if(off >= ctx->size || ISNEWLINE(off)) {
5739 /* Blank line does not need any real indentation to be nested inside
5740 * a list. */
5741 if(n_brothers + n_children == 0) {
5742 while(n_parents < ctx->n_containers && ctx->containers[n_parents].ch != _T('>'))
5743 n_parents++;
5744 }
5745 }
5746
5747 while(TRUE) {
5748 /* Check whether we are fenced code continuation. */
5749 if(pivot_line->type == MD_LINE_FENCEDCODE) {
5750 line->beg = off;
5751
5752 /* We are another MD_LINE_FENCEDCODE unless we are closing fence
5753 * which we transform into MD_LINE_BLANK. */
5754 if(line->indent < ctx->code_indent_offset) {
5755 if(md_is_closing_code_fence(ctx, CH(pivot_line->beg), beg: off, p_end: &off)) {
5756 line->type = MD_LINE_BLANK;
5757 ctx->last_line_has_list_loosening_effect = FALSE;
5758 break;
5759 }
5760 }
5761
5762 /* Change indentation accordingly to the initial code fence. */
5763 if(n_parents == ctx->n_containers) {
5764 if(line->indent > pivot_line->indent)
5765 line->indent -= pivot_line->indent;
5766 else
5767 line->indent = 0;
5768
5769 line->type = MD_LINE_FENCEDCODE;
5770 break;
5771 }
5772 }
5773
5774 /* Check whether we are HTML block continuation. */
5775 if(pivot_line->type == MD_LINE_HTML && ctx->html_block_type > 0) {
5776 if(n_parents < ctx->n_containers) {
5777 /* HTML block is implicitly ended if the enclosing container
5778 * block ends. */
5779 ctx->html_block_type = 0;
5780 } else {
5781 int html_block_type;
5782
5783 html_block_type = md_is_html_block_end_condition(ctx, beg: off, p_end: &off);
5784 if(html_block_type > 0) {
5785 MD_ASSERT(html_block_type == ctx->html_block_type);
5786
5787 /* Make sure this is the last line of the block. */
5788 ctx->html_block_type = 0;
5789
5790 /* Some end conditions serve as blank lines at the same time. */
5791 if(html_block_type == 6 || html_block_type == 7) {
5792 line->type = MD_LINE_BLANK;
5793 line->indent = 0;
5794 break;
5795 }
5796 }
5797
5798 line->type = MD_LINE_HTML;
5799 n_parents = ctx->n_containers;
5800 break;
5801 }
5802 }
5803
5804 /* Check for blank line. */
5805 if(off >= ctx->size || ISNEWLINE(off)) {
5806 if(pivot_line->type == MD_LINE_INDENTEDCODE && n_parents == ctx->n_containers) {
5807 line->type = MD_LINE_INDENTEDCODE;
5808 if(line->indent > ctx->code_indent_offset)
5809 line->indent -= ctx->code_indent_offset;
5810 else
5811 line->indent = 0;
5812 ctx->last_line_has_list_loosening_effect = FALSE;
5813 } else {
5814 line->type = MD_LINE_BLANK;
5815 ctx->last_line_has_list_loosening_effect = (n_parents > 0 &&
5816 n_brothers + n_children == 0 &&
5817 ctx->containers[n_parents-1].ch != _T('>'));
5818
5819 #if 1
5820 /* See https://github.com/mity/md4c/issues/6
5821 *
5822 * This ugly checking tests we are in (yet empty) list item but not
5823 * its very first line (with the list item mark).
5824 *
5825 * If we are such blank line, then any following non-blank line
5826 * which would be part of this list item actually ends the list
5827 * because "a list item can begin with at most one blank line."
5828 */
5829 if(n_parents > 0 && ctx->containers[n_parents-1].ch != _T('>') &&
5830 n_brothers + n_children == 0 && ctx->current_block == NULL &&
5831 ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5832 {
5833 MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5834 if(top_block->type == MD_BLOCK_LI)
5835 ctx->last_list_item_starts_with_two_blank_lines = TRUE;
5836 }
5837 #endif
5838 }
5839 break;
5840 } else {
5841 #if 1
5842 /* This is 2nd half of the hack. If the flag is set (that is there
5843 * were 2nd blank line at the start of the list item) and we would also
5844 * belonging to such list item, than interrupt the list. */
5845 ctx->last_line_has_list_loosening_effect = FALSE;
5846 if(ctx->last_list_item_starts_with_two_blank_lines) {
5847 if(n_parents > 0 && ctx->containers[n_parents-1].ch != _T('>') &&
5848 n_brothers + n_children == 0 && ctx->current_block == NULL &&
5849 ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5850 {
5851 MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5852 if(top_block->type == MD_BLOCK_LI)
5853 n_parents--;
5854 }
5855
5856 ctx->last_list_item_starts_with_two_blank_lines = FALSE;
5857 }
5858 #endif
5859 }
5860
5861 /* Check whether we are Setext underline. */
5862 if(line->indent < ctx->code_indent_offset && pivot_line->type == MD_LINE_TEXT
5863 && off < ctx->size && ISANYOF2(off, _T('='), _T('-'))
5864 && (n_parents == ctx->n_containers))
5865 {
5866 unsigned level;
5867
5868 if(md_is_setext_underline(ctx, beg: off, p_end: &off, p_level: &level)) {
5869 line->type = MD_LINE_SETEXTUNDERLINE;
5870 line->data = level;
5871 break;
5872 }
5873 }
5874
5875 /* Check for thematic break line. */
5876 if(line->indent < ctx->code_indent_offset
5877 && off < ctx->size && off >= hr_killer
5878 && ISANYOF(off, _T("-_*")))
5879 {
5880 if(md_is_hr_line(ctx, beg: off, p_end: &off, p_killer: &hr_killer)) {
5881 line->type = MD_LINE_HR;
5882 break;
5883 }
5884 }
5885
5886 /* Check for "brother" container. I.e. whether we are another list item
5887 * in already started list. */
5888 if(n_parents < ctx->n_containers && n_brothers + n_children == 0) {
5889 OFF tmp;
5890
5891 if(md_is_container_mark(ctx, indent: line->indent, beg: off, p_end: &tmp, p_container: &container) &&
5892 md_is_container_compatible(pivot: &ctx->containers[n_parents], container: &container))
5893 {
5894 pivot_line = &md_dummy_blank_line;
5895
5896 off = tmp;
5897
5898 total_indent += container.contents_indent - container.mark_indent;
5899 line->indent = md_line_indentation(ctx, total_indent, beg: off, p_end: &off);
5900 total_indent += line->indent;
5901 line->beg = off;
5902
5903 /* Some of the following whitespace actually still belongs to the mark. */
5904 if(off >= ctx->size || ISNEWLINE(off)) {
5905 container.contents_indent++;
5906 } else if(line->indent <= ctx->code_indent_offset) {
5907 container.contents_indent += line->indent;
5908 line->indent = 0;
5909 } else {
5910 container.contents_indent += 1;
5911 line->indent--;
5912 }
5913
5914 ctx->containers[n_parents].mark_indent = container.mark_indent;
5915 ctx->containers[n_parents].contents_indent = container.contents_indent;
5916
5917 n_brothers++;
5918 continue;
5919 }
5920 }
5921
5922 /* Check for indented code.
5923 * Note indented code block cannot interrupt a paragraph. */
5924 if(line->indent >= ctx->code_indent_offset &&
5925 (pivot_line->type == MD_LINE_BLANK || pivot_line->type == MD_LINE_INDENTEDCODE))
5926 {
5927 line->type = MD_LINE_INDENTEDCODE;
5928 MD_ASSERT(line->indent >= ctx->code_indent_offset);
5929 line->indent -= ctx->code_indent_offset;
5930 line->data = 0;
5931 break;
5932 }
5933
5934 /* Check for start of a new container block. */
5935 if(line->indent < ctx->code_indent_offset &&
5936 md_is_container_mark(ctx, indent: line->indent, beg: off, p_end: &off, p_container: &container))
5937 {
5938 if(pivot_line->type == MD_LINE_TEXT && n_parents == ctx->n_containers &&
5939 (off >= ctx->size || ISNEWLINE(off)) && container.ch != _T('>'))
5940 {
5941 /* Noop. List mark followed by a blank line cannot interrupt a paragraph. */
5942 } else if(pivot_line->type == MD_LINE_TEXT && n_parents == ctx->n_containers &&
5943 ISANYOF2_(container.ch, _T('.'), _T(')')) && container.start != 1)
5944 {
5945 /* Noop. Ordered list cannot interrupt a paragraph unless the start index is 1. */
5946 } else {
5947 total_indent += container.contents_indent - container.mark_indent;
5948 line->indent = md_line_indentation(ctx, total_indent, beg: off, p_end: &off);
5949 total_indent += line->indent;
5950
5951 line->beg = off;
5952 line->data = container.ch;
5953
5954 /* Some of the following whitespace actually still belongs to the mark. */
5955 if(off >= ctx->size || ISNEWLINE(off)) {
5956 container.contents_indent++;
5957 } else if(line->indent <= ctx->code_indent_offset) {
5958 container.contents_indent += line->indent;
5959 line->indent = 0;
5960 } else {
5961 container.contents_indent += 1;
5962 line->indent--;
5963 }
5964
5965 if(n_brothers + n_children == 0)
5966 pivot_line = &md_dummy_blank_line;
5967
5968 if(n_children == 0)
5969 MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
5970
5971 n_children++;
5972 MD_CHECK(md_push_container(ctx, &container));
5973 continue;
5974 }
5975 }
5976
5977 /* Check whether we are table continuation. */
5978 if(pivot_line->type == MD_LINE_TABLE && n_parents == ctx->n_containers) {
5979 line->type = MD_LINE_TABLE;
5980 break;
5981 }
5982
5983 /* Check for ATX header. */
5984 if(line->indent < ctx->code_indent_offset &&
5985 off < ctx->size && CH(off) == _T('#'))
5986 {
5987 unsigned level;
5988
5989 if(md_is_atxheader_line(ctx, beg: off, p_beg: &line->beg, p_end: &off, p_level: &level)) {
5990 line->type = MD_LINE_ATXHEADER;
5991 line->data = level;
5992 break;
5993 }
5994 }
5995
5996 /* Check whether we are starting code fence. */
5997 if(off < ctx->size && ISANYOF2(off, _T('`'), _T('~'))) {
5998 if(md_is_opening_code_fence(ctx, beg: off, p_end: &off)) {
5999 line->type = MD_LINE_FENCEDCODE;
6000 line->data = 1;
6001 break;
6002 }
6003 }
6004
6005 /* Check for start of raw HTML block. */
6006 if(off < ctx->size && CH(off) == _T('<')
6007 && !(ctx->parser.flags & MD_FLAG_NOHTMLBLOCKS))
6008 {
6009 ctx->html_block_type = md_is_html_block_start_condition(ctx, beg: off);
6010
6011 /* HTML block type 7 cannot interrupt paragraph. */
6012 if(ctx->html_block_type == 7 && pivot_line->type == MD_LINE_TEXT)
6013 ctx->html_block_type = 0;
6014
6015 if(ctx->html_block_type > 0) {
6016 /* The line itself also may immediately close the block. */
6017 if(md_is_html_block_end_condition(ctx, beg: off, p_end: &off) == ctx->html_block_type) {
6018 /* Make sure this is the last line of the block. */
6019 ctx->html_block_type = 0;
6020 }
6021
6022 line->type = MD_LINE_HTML;
6023 break;
6024 }
6025 }
6026
6027 /* Check for table underline. */
6028 if((ctx->parser.flags & MD_FLAG_TABLES) && pivot_line->type == MD_LINE_TEXT
6029 && off < ctx->size && ISANYOF3(off, _T('|'), _T('-'), _T(':'))
6030 && n_parents == ctx->n_containers)
6031 {
6032 unsigned col_count;
6033
6034 if(ctx->current_block != NULL && ctx->current_block->n_lines == 1 &&
6035 md_is_table_underline(ctx, beg: off, p_end: &off, p_col_count: &col_count))
6036 {
6037 line->data = col_count;
6038 line->type = MD_LINE_TABLEUNDERLINE;
6039 break;
6040 }
6041 }
6042
6043 /* By default, we are normal text line. */
6044 line->type = MD_LINE_TEXT;
6045 if(pivot_line->type == MD_LINE_TEXT && n_brothers + n_children == 0) {
6046 /* Lazy continuation. */
6047 n_parents = ctx->n_containers;
6048 }
6049
6050 /* Check for task mark. */
6051 if((ctx->parser.flags & MD_FLAG_TASKLISTS) && n_brothers + n_children > 0 &&
6052 ISANYOF_(ctx->containers[ctx->n_containers-1].ch, _T("-+*.)")))
6053 {
6054 OFF tmp = off;
6055
6056 while(tmp < ctx->size && tmp < off + 3 && ISBLANK(tmp))
6057 tmp++;
6058 if(tmp + 2 < ctx->size && CH(tmp) == _T('[') &&
6059 ISANYOF(tmp+1, _T("xX ")) && CH(tmp+2) == _T(']') &&
6060 (tmp + 3 == ctx->size || ISBLANK(tmp+3) || ISNEWLINE(tmp+3)))
6061 {
6062 MD_CONTAINER* task_container = (n_children > 0 ? &ctx->containers[ctx->n_containers-1] : &container);
6063 task_container->is_task = TRUE;
6064 task_container->task_mark_off = tmp + 1;
6065 off = tmp + 3;
6066 while(ISWHITESPACE(off))
6067 off++;
6068 line->beg = off;
6069 }
6070 }
6071
6072 break;
6073 }
6074
6075 /* Scan for end of the line.
6076 *
6077 * Note this is quite a bottleneck of the parsing as we here iterate almost
6078 * over compete document.
6079 */
6080#if defined __linux__ && !defined MD4C_USE_UTF16
6081 /* Recent glibc versions have superbly optimized strcspn(), even using
6082 * vectorization if available. */
6083 if(ctx->doc_ends_with_newline && off < ctx->size) {
6084 while(TRUE) {
6085 off += (OFF) strcspn(STR(off), reject: "\r\n");
6086
            /* strcspn() can stop on zero terminator; but that can appear
             * anywhere in the Markdown input... */
6089 if(CH(off) == _T('\0'))
6090 off++;
6091 else
6092 break;
6093 }
6094 } else
6095#endif
6096 {
6097 /* Optimization: Use some loop unrolling. */
6098 while(off + 3 < ctx->size && !ISNEWLINE(off+0) && !ISNEWLINE(off+1)
6099 && !ISNEWLINE(off+2) && !ISNEWLINE(off+3))
6100 off += 4;
6101 while(off < ctx->size && !ISNEWLINE(off))
6102 off++;
6103 }
6104
6105 /* Set end of the line. */
6106 line->end = off;
6107
6108 /* But for ATX header, we should exclude the optional trailing mark. */
6109 if(line->type == MD_LINE_ATXHEADER) {
6110 OFF tmp = line->end;
6111 while(tmp > line->beg && CH(tmp-1) == _T(' '))
6112 tmp--;
6113 while(tmp > line->beg && CH(tmp-1) == _T('#'))
6114 tmp--;
6115 if(tmp == line->beg || CH(tmp-1) == _T(' ') || (ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS))
6116 line->end = tmp;
6117 }
6118
6119 /* Trim trailing spaces. */
6120 if(line->type != MD_LINE_INDENTEDCODE && line->type != MD_LINE_FENCEDCODE) {
6121 while(line->end > line->beg && CH(line->end-1) == _T(' '))
6122 line->end--;
6123 }
6124
6125 /* Eat also the new line. */
6126 if(off < ctx->size && CH(off) == _T('\r'))
6127 off++;
6128 if(off < ctx->size && CH(off) == _T('\n'))
6129 off++;
6130
6131 *p_end = off;
6132
6133 /* If we belong to a list after seeing a blank line, the list is loose. */
6134 if(prev_line_has_list_loosening_effect && line->type != MD_LINE_BLANK && n_parents + n_brothers > 0) {
6135 MD_CONTAINER* c = &ctx->containers[n_parents + n_brothers - 1];
6136 if(c->ch != _T('>')) {
6137 MD_BLOCK* block = (MD_BLOCK*) (((char*)ctx->block_bytes) + c->block_byte_off);
6138 block->flags |= MD_BLOCK_LOOSE_LIST;
6139 }
6140 }
6141
6142 /* Leave any containers we are not part of anymore. */
6143 if(n_children == 0 && n_parents + n_brothers < ctx->n_containers)
6144 MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
6145
6146 /* Enter any container we found a mark for. */
6147 if(n_brothers > 0) {
6148 MD_ASSERT(n_brothers == 1);
6149 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6150 ctx->containers[n_parents].task_mark_off,
6151 (ctx->containers[n_parents].is_task ? CH(ctx->containers[n_parents].task_mark_off) : 0),
6152 MD_BLOCK_CONTAINER_CLOSER));
6153 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6154 container.task_mark_off,
6155 (container.is_task ? CH(container.task_mark_off) : 0),
6156 MD_BLOCK_CONTAINER_OPENER));
6157 ctx->containers[n_parents].is_task = container.is_task;
6158 ctx->containers[n_parents].task_mark_off = container.task_mark_off;
6159 }
6160
6161 if(n_children > 0)
6162 MD_CHECK(md_enter_child_containers(ctx, n_children));
6163
6164abort:
6165 return ret;
6166}
6167
6168static int
6169md_process_line(MD_CTX* ctx, const MD_LINE_ANALYSIS** p_pivot_line, MD_LINE_ANALYSIS* line)
6170{
6171 const MD_LINE_ANALYSIS* pivot_line = *p_pivot_line;
6172 int ret = 0;
6173
6174 /* Blank line ends current leaf block. */
6175 if(line->type == MD_LINE_BLANK) {
6176 MD_CHECK(md_end_current_block(ctx));
6177 *p_pivot_line = &md_dummy_blank_line;
6178 return 0;
6179 }
6180
6181 /* Some line types form block on their own. */
6182 if(line->type == MD_LINE_HR || line->type == MD_LINE_ATXHEADER) {
6183 MD_CHECK(md_end_current_block(ctx));
6184
6185 /* Add our single-line block. */
6186 MD_CHECK(md_start_new_block(ctx, line));
6187 MD_CHECK(md_add_line_into_current_block(ctx, line));
6188 MD_CHECK(md_end_current_block(ctx));
6189 *p_pivot_line = &md_dummy_blank_line;
6190 return 0;
6191 }
6192
6193 /* MD_LINE_SETEXTUNDERLINE changes meaning of the current block and ends it. */
6194 if(line->type == MD_LINE_SETEXTUNDERLINE) {
6195 MD_ASSERT(ctx->current_block != NULL);
6196 ctx->current_block->type = MD_BLOCK_H;
6197 ctx->current_block->data = line->data;
6198 ctx->current_block->flags |= MD_BLOCK_SETEXT_HEADER;
6199 MD_CHECK(md_add_line_into_current_block(ctx, line));
6200 MD_CHECK(md_end_current_block(ctx));
6201 if(ctx->current_block == NULL) {
6202 *p_pivot_line = &md_dummy_blank_line;
6203 } else {
6204 /* This happens if we have consumed all the body as link ref. defs.
6205 * and downgraded the underline into start of a new paragraph block. */
6206 line->type = MD_LINE_TEXT;
6207 *p_pivot_line = line;
6208 }
6209 return 0;
6210 }
6211
6212 /* MD_LINE_TABLEUNDERLINE changes meaning of the current block. */
6213 if(line->type == MD_LINE_TABLEUNDERLINE) {
6214 MD_ASSERT(ctx->current_block != NULL);
6215 MD_ASSERT(ctx->current_block->n_lines == 1);
6216 ctx->current_block->type = MD_BLOCK_TABLE;
6217 ctx->current_block->data = line->data;
6218 MD_ASSERT(pivot_line != &md_dummy_blank_line);
6219 ((MD_LINE_ANALYSIS*)pivot_line)->type = MD_LINE_TABLE;
6220 MD_CHECK(md_add_line_into_current_block(ctx, line));
6221 return 0;
6222 }
6223
6224 /* The current block also ends if the line has different type. */
6225 if(line->type != pivot_line->type)
6226 MD_CHECK(md_end_current_block(ctx));
6227
6228 /* The current line may start a new block. */
6229 if(ctx->current_block == NULL) {
6230 MD_CHECK(md_start_new_block(ctx, line));
6231 *p_pivot_line = line;
6232 }
6233
6234 /* In all other cases the line is just a continuation of the current block. */
6235 MD_CHECK(md_add_line_into_current_block(ctx, line));
6236
6237abort:
6238 return ret;
6239}
6240
6241static int
6242md_process_doc(MD_CTX *ctx)
6243{
6244 const MD_LINE_ANALYSIS* pivot_line = &md_dummy_blank_line;
6245 MD_LINE_ANALYSIS line_buf[2];
6246 MD_LINE_ANALYSIS* line = &line_buf[0];
6247 OFF off = 0;
6248 int ret = 0;
6249
6250 MD_ENTER_BLOCK(MD_BLOCK_DOC, NULL);
6251
6252 while(off < ctx->size) {
6253 if(line == pivot_line)
6254 line = (line == &line_buf[0] ? &line_buf[1] : &line_buf[0]);
6255
6256 MD_CHECK(md_analyze_line(ctx, off, &off, pivot_line, line));
6257 MD_CHECK(md_process_line(ctx, &pivot_line, line));
6258 }
6259
6260 md_end_current_block(ctx);
6261
6262 MD_CHECK(md_build_ref_def_hashtable(ctx));
6263
6264 /* Process all blocks. */
6265 MD_CHECK(md_leave_child_containers(ctx, 0));
6266 MD_CHECK(md_process_all_blocks(ctx));
6267
6268 MD_LEAVE_BLOCK(MD_BLOCK_DOC, NULL);
6269
6270abort:
6271
6272#if 0
6273 /* Output some memory consumption statistics. */
6274 {
6275 char buffer[256];
6276 sprintf(buffer, "Alloced %u bytes for block buffer.",
6277 (unsigned)(ctx->alloc_block_bytes));
6278 MD_LOG(buffer);
6279
6280 sprintf(buffer, "Alloced %u bytes for containers buffer.",
6281 (unsigned)(ctx->alloc_containers * sizeof(MD_CONTAINER)));
6282 MD_LOG(buffer);
6283
6284 sprintf(buffer, "Alloced %u bytes for marks buffer.",
6285 (unsigned)(ctx->alloc_marks * sizeof(MD_MARK)));
6286 MD_LOG(buffer);
6287
6288 sprintf(buffer, "Alloced %u bytes for aux. buffer.",
6289 (unsigned)(ctx->alloc_buffer * sizeof(MD_CHAR)));
6290 MD_LOG(buffer);
6291 }
6292#endif
6293
6294 return ret;
6295}
6296
6297
6298/********************
6299 *** Public API ***
6300 ********************/
6301
6302int
6303md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata)
6304{
6305 MD_CTX ctx;
6306 int i;
6307 int ret;
6308
6309 if(parser->abi_version != 0) {
6310 if(parser->debug_log != NULL)
6311 parser->debug_log("Unsupported abi_version.", userdata);
6312 return -1;
6313 }
6314
6315 /* Setup context structure. */
6316 memset(s: &ctx, c: 0, n: sizeof(MD_CTX));
6317 ctx.text = text;
6318 ctx.size = size;
6319 memcpy(dest: &ctx.parser, src: parser, n: sizeof(MD_PARSER));
6320 ctx.userdata = userdata;
6321 ctx.code_indent_offset = (ctx.parser.flags & MD_FLAG_NOINDENTEDCODEBLOCKS) ? (OFF)(-1) : 4;
6322 md_build_mark_char_map(ctx: &ctx);
6323 ctx.doc_ends_with_newline = (size > 0 && ISNEWLINE_(text[size-1]));
6324
6325 /* Reset all unresolved opener mark chains. */
6326 for(i = 0; i < (int) SIZEOF_ARRAY(ctx.mark_chains); i++) {
6327 ctx.mark_chains[i].head = -1;
6328 ctx.mark_chains[i].tail = -1;
6329 }
6330 ctx.unresolved_link_head = -1;
6331 ctx.unresolved_link_tail = -1;
6332
6333 /* All the work. */
6334 ret = md_process_doc(ctx: &ctx);
6335
6336 /* Clean-up. */
6337 md_free_ref_defs(ctx: &ctx);
6338 md_free_ref_def_hashtable(ctx: &ctx);
6339 free(ptr: ctx.buffer);
6340 free(ptr: ctx.marks);
6341 free(ptr: ctx.block_bytes);
6342 free(ptr: ctx.containers);
6343
6344 return ret;
6345}
6346

source code of qtbase/src/3rdparty/md4c/md4c.c