Skip to content

Commit a9696e2

Browse files
phillipb and UziTech authored
fix: retain line breaks in tokens properly (#2341)
* Fix lexer and tokenizer to retain line breaks properly * Add test for bug * Check for line breaks not just spaces * Fix lint * Fix spacing in test * clean up code Co-authored-by: Tony Brix <[email protected]>
1 parent 6aacd13 commit a9696e2

File tree

4 files changed

+177
-68
lines changed

4 files changed

+177
-68
lines changed

src/Lexer.js

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,11 @@ export class Lexer {
152152
// newline
153153
if (token = this.tokenizer.space(src)) {
154154
src = src.substring(token.raw.length);
155-
if (token.type) {
155+
if (token.raw.length === 1 && tokens.length > 0) {
156+
// if there's a single \n as a spacer, it's terminating the last line,
157+
// so move it there so that we don't get unnecessary paragraph tags
158+
tokens[tokens.length - 1].raw += '\n';
159+
} else {
156160
tokens.push(token);
157161
}
158162
continue;

src/Tokenizer.js

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -72,14 +72,11 @@ export class Tokenizer {
7272

7373
space(src) {
7474
const cap = this.rules.block.newline.exec(src);
75-
if (cap) {
76-
if (cap[0].length > 1) {
77-
return {
78-
type: 'space',
79-
raw: cap[0]
80-
};
81-
}
82-
return { raw: '\n' };
75+
if (cap && cap[0].length > 0) {
76+
return {
77+
type: 'space',
78+
raw: cap[0]
79+
};
8380
}
8481
}
8582

@@ -303,7 +300,24 @@ export class Tokenizer {
303300
for (i = 0; i < l; i++) {
304301
this.lexer.state.top = false;
305302
list.items[i].tokens = this.lexer.blockTokens(list.items[i].text, []);
306-
if (!list.loose && list.items[i].tokens.some(t => t.type === 'space')) {
303+
const spacers = list.items[i].tokens.filter(t => t.type === 'space');
304+
const hasMultipleLineBreaks = spacers.every(t => {
305+
const chars = t.raw.split('');
306+
let lineBreaks = 0;
307+
for (const char of chars) {
308+
if (char === '\n') {
309+
lineBreaks += 1;
310+
}
311+
if (lineBreaks > 1) {
312+
return true;
313+
}
314+
}
315+
316+
return false;
317+
});
318+
319+
if (!list.loose && spacers.length && hasMultipleLineBreaks) {
320+
// Having a single line break doesn't mean a list is loose. A single line break is terminating the last list item
307321
list.loose = true;
308322
list.items[i].loose = true;
309323
}

test/unit/Lexer-spec.js

Lines changed: 148 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,10 @@ lheading 2
9393
----------
9494
`,
9595
tokens: [
96+
{
97+
type: 'space',
98+
raw: '\n'
99+
},
96100
{
97101
type: 'heading',
98102
raw: '# heading 1\n\n',
@@ -175,6 +179,9 @@ lheading 2
175179
| 1 | 2 |
176180
`,
177181
tokens: [{
182+
type: 'space',
183+
raw: '\n'
184+
}, {
178185
type: 'table',
179186
align: [null, null],
180187
raw: '| a | b |\n|---|---|\n| 1 | 2 |\n',
@@ -212,40 +219,42 @@ paragraph 1
212219
|---|---|
213220
| 1 | 2 |
214221
`,
215-
tokens: [
216-
{
217-
type: 'paragraph',
218-
raw: 'paragraph 1',
219-
text: 'paragraph 1',
220-
tokens: [{ type: 'text', raw: 'paragraph 1', text: 'paragraph 1' }]
221-
},
222-
{
223-
type: 'table',
224-
align: [null, null],
225-
raw: '| a | b |\n|---|---|\n| 1 | 2 |\n',
226-
header: [
222+
tokens: [{
223+
type: 'space',
224+
raw: '\n'
225+
}, {
226+
type: 'paragraph',
227+
raw: 'paragraph 1\n',
228+
text: 'paragraph 1',
229+
tokens: [{ type: 'text', raw: 'paragraph 1', text: 'paragraph 1' }]
230+
},
231+
{
232+
type: 'table',
233+
align: [null, null],
234+
raw: '| a | b |\n|---|---|\n| 1 | 2 |\n',
235+
header: [
236+
{
237+
text: 'a',
238+
tokens: [{ type: 'text', raw: 'a', text: 'a' }]
239+
},
240+
{
241+
text: 'b',
242+
tokens: [{ type: 'text', raw: 'b', text: 'b' }]
243+
}
244+
],
245+
rows: [
246+
[
227247
{
228-
text: 'a',
229-
tokens: [{ type: 'text', raw: 'a', text: 'a' }]
248+
text: '1',
249+
tokens: [{ type: 'text', raw: '1', text: '1' }]
230250
},
231251
{
232-
text: 'b',
233-
tokens: [{ type: 'text', raw: 'b', text: 'b' }]
252+
text: '2',
253+
tokens: [{ type: 'text', raw: '2', text: '2' }]
234254
}
235-
],
236-
rows: [
237-
[
238-
{
239-
text: '1',
240-
tokens: [{ type: 'text', raw: '1', text: '1' }]
241-
},
242-
{
243-
text: '2',
244-
tokens: [{ type: 'text', raw: '2', text: '2' }]
245-
}
246-
]
247255
]
248-
}
256+
]
257+
}
249258
]
250259
});
251260
});
@@ -258,6 +267,9 @@ paragraph 1
258267
| 1 | 2 | 3 |
259268
`,
260269
tokens: [{
270+
type: 'space',
271+
raw: '\n'
272+
}, {
261273
type: 'table',
262274
align: ['left', 'center', 'right'],
263275
raw: '| a | b | c |\n|:--|:-:|--:|\n| 1 | 2 | 3 |\n',
@@ -302,33 +314,37 @@ a | b
302314
--|--
303315
1 | 2
304316
`,
305-
tokens: [{
306-
type: 'table',
307-
align: [null, null],
308-
raw: 'a | b\n--|--\n1 | 2\n',
309-
header: [
310-
{
311-
text: 'a',
312-
tokens: [{ type: 'text', raw: 'a', text: 'a' }]
313-
},
314-
{
315-
text: 'b',
316-
tokens: [{ type: 'text', raw: 'b', text: 'b' }]
317-
}
318-
],
319-
rows: [
320-
[
317+
tokens: [
318+
{
319+
type: 'space',
320+
raw: '\n'
321+
}, {
322+
type: 'table',
323+
align: [null, null],
324+
raw: 'a | b\n--|--\n1 | 2\n',
325+
header: [
321326
{
322-
text: '1',
323-
tokens: [{ type: 'text', raw: '1', text: '1' }]
327+
text: 'a',
328+
tokens: [{ type: 'text', raw: 'a', text: 'a' }]
324329
},
325330
{
326-
text: '2',
327-
tokens: [{ type: 'text', raw: '2', text: '2' }]
331+
text: 'b',
332+
tokens: [{ type: 'text', raw: 'b', text: 'b' }]
328333
}
334+
],
335+
rows: [
336+
[
337+
{
338+
text: '1',
339+
tokens: [{ type: 'text', raw: '1', text: '1' }]
340+
},
341+
{
342+
text: '2',
343+
tokens: [{ type: 'text', raw: '2', text: '2' }]
344+
}
345+
]
329346
]
330-
]
331-
}]
347+
}]
332348
});
333349
});
334350
});
@@ -342,6 +358,19 @@ a | b
342358
]
343359
});
344360
});
361+
362+
it('after line break does not consume raw \n', () => {
363+
expectTokens({
364+
md: 'T\nh\n---',
365+
tokens:
366+
jasmine.arrayContaining([
367+
jasmine.objectContaining({
368+
raw: 'T\nh\n'
369+
}),
370+
{ type: 'hr', raw: '---' }
371+
])
372+
});
373+
});
345374
});
346375

347376
describe('blockquote', () => {
@@ -376,8 +405,11 @@ a | b
376405
`,
377406
tokens: [
378407
{
408+
type: 'space',
409+
raw: '\n'
410+
}, {
379411
type: 'list',
380-
raw: '- item 1\n- item 2',
412+
raw: '- item 1\n- item 2\n',
381413
ordered: false,
382414
start: '',
383415
loose: false,
@@ -423,9 +455,13 @@ a | b
423455
2. item 2
424456
`,
425457
tokens: jasmine.arrayContaining([
458+
jasmine.objectContaining({
459+
type: 'space',
460+
raw: '\n'
461+
}),
426462
jasmine.objectContaining({
427463
type: 'list',
428-
raw: '1. item 1\n2. item 2',
464+
raw: '1. item 1\n2. item 2\n',
429465
ordered: true,
430466
start: 1,
431467
items: [
@@ -448,9 +484,13 @@ a | b
448484
2) item 2
449485
`,
450486
tokens: jasmine.arrayContaining([
487+
jasmine.objectContaining({
488+
type: 'space',
489+
raw: '\n'
490+
}),
451491
jasmine.objectContaining({
452492
type: 'list',
453-
raw: '1) item 1\n2) item 2',
493+
raw: '1) item 1\n2) item 2\n',
454494
ordered: true,
455495
start: 1,
456496
items: [
@@ -475,6 +515,10 @@ a | b
475515
paragraph
476516
`,
477517
tokens: [
518+
{
519+
type: 'space',
520+
raw: '\n'
521+
},
478522
{
479523
type: 'list',
480524
raw: '- item 1\n- item 2',
@@ -515,7 +559,7 @@ paragraph
515559
{ type: 'space', raw: '\n\n' },
516560
{
517561
type: 'paragraph',
518-
raw: 'paragraph',
562+
raw: 'paragraph\n',
519563
text: 'paragraph',
520564
tokens: [{
521565
type: 'text',
@@ -534,9 +578,13 @@ paragraph
534578
3. item 2
535579
`,
536580
tokens: jasmine.arrayContaining([
581+
jasmine.objectContaining({
582+
type: 'space',
583+
raw: '\n'
584+
}),
537585
jasmine.objectContaining({
538586
type: 'list',
539-
raw: '2. item 1\n3. item 2',
587+
raw: '2. item 1\n3. item 2\n',
540588
ordered: true,
541589
start: 2,
542590
items: [
@@ -560,9 +608,13 @@ paragraph
560608
- item 2
561609
`,
562610
tokens: jasmine.arrayContaining([
611+
jasmine.objectContaining({
612+
type: 'space',
613+
raw: '\n'
614+
}),
563615
jasmine.objectContaining({
564616
type: 'list',
565-
raw: '- item 1\n\n- item 2',
617+
raw: '- item 1\n\n- item 2\n',
566618
loose: true,
567619
items: [
568620
jasmine.objectContaining({
@@ -577,16 +629,54 @@ paragraph
577629
});
578630
});
579631

632+
it('not loose with spaces', () => {
633+
expectTokens({
634+
md: `
635+
- item 1
636+
- item 2
637+
`,
638+
tokens: jasmine.arrayContaining([
639+
jasmine.objectContaining({
640+
type: 'space',
641+
raw: '\n'
642+
}),
643+
jasmine.objectContaining({
644+
type: 'list',
645+
raw: '- item 1\n - item 2\n',
646+
loose: false,
647+
items: [
648+
jasmine.objectContaining({
649+
raw: '- item 1\n - item 2',
650+
tokens: jasmine.arrayContaining([
651+
jasmine.objectContaining({
652+
raw: 'item 1\n'
653+
}),
654+
jasmine.objectContaining({
655+
type: 'list',
656+
raw: '- item 2'
657+
})
658+
])
659+
})
660+
]
661+
})
662+
])
663+
});
664+
});
665+
580666
it('task', () => {
581667
expectTokens({
582668
md: `
583669
- [ ] item 1
584670
- [x] item 2
585671
`,
586672
tokens: jasmine.arrayContaining([
673+
jasmine.objectContaining({
674+
type: 'space',
675+
raw: '\n'
676+
}),
587677
jasmine.objectContaining({
588678
type: 'list',
589-
raw: '- [ ] item 1\n- [x] item 2',
679+
raw: '- [ ] item 1\n- [x] item 2\n',
590680
items: [
591681
jasmine.objectContaining({
592682
raw: '- [ ] item 1\n',

test/unit/marked-spec.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -994,6 +994,7 @@ br
994994
});
995995

996996
expect(tokensSeen).toEqual([
997+
['space', ''],
997998
['paragraph', 'paragraph'],
998999
['text', 'paragraph'],
9991000
['space', ''],

0 commit comments

Comments (0)