refactor MarkdownParser

2021-11-27 12:54:06 +01:00 · 2021-11-27 12:54:06 +01:00 · 134d666480
commit 134d666480
parent 87e053f99d
2 changed files with 358 additions and 331 deletions
--- a/src/core/MarkdownParser.js
+++ b/src/core/MarkdownParser.js
@ -9,361 +9,389 @@
 * @flow
 */

-export default class MarkdownParser {
-  static nonWithespace = /\S|$/;
-  static expArticle = /(?:\n|^)\s*?(?=\#)|(?<=(?:\n|^)\s*?(?=\#).*?)\n/;
+let parseMText = () => {};

-  constructor(opt) {
-    this.parseLinks = opt && opt.parseLinks || false;
-    this.tabWidth = opt && opt.tabWidth || 4;
-    this.newlineBreaksArticles = opt && opt.newlineBreaksArticles || true;
+class MString {
+  constructor(text, start) {
+    this.txt = text;
+    this.iter = start || 0;
  }

-  parse(text: string) {
-    return this.parseText(text, 0, 0)[0];
+  nextChar() {
+    this.iter += 1;
+    return this.txt[this.iter];
  }

-  parseText(text, headingLevel, start) {
-    let mdArray = [];
-    let iter = start;
-    while (iter < text.length) {
-      const [aMdArray, newIter] = this.parseSection(
-        text, iter, headingLevel,
-      );
-      iter = newIter;
-      mdArray = mdArray.concat(aMdArray);
-      // either heading hit or article end
-      const chr = text[iter];
-      if (chr === '#') {
-        let subLvl = 0;
-        for (;
-          iter + subLvl <= text.length && text[iter + subLvl] === '#';
-          subLvl += 1
-        ) {}
-        if (subLvl <= headingLevel || headingLevel === 6) {
-          // end of article
-          // encountered title with same headingLevel or lower
-          break;
-        } else {
-          // child article
-          let lineEnd = text.indexOf('\n', iter);
-          if (lineEnd === -1) lineEnd = text.length;
-          const title = text.slice(iter + subLvl, lineEnd).trimLeft();
-          subLvl = Math.min(subLvl, 6);
-          const [subMdArray, newIter] = this.parseText(
-            text, subLvl, lineEnd + 1,
-          );
-          mdArray.push(['a', subLvl, title, subMdArray]);
-          iter = newIter;
-        }
-      } else {
+  /*
+   * @return true once the cursor has reached or passed the end of the text
+   */
+  done() {
+    return (this.iter >= this.txt.length);
+  }
+
+  /*
+   * advance the cursor by one character
+   * @return true if the cursor still points inside the text afterwards
+   */
+  moveForward() {
+    return this.move(1);
+  }
+
+  /*
+   * place the cursor at an absolute position
+   */
+  setIter(iter) {
+    this.iter = iter;
+  }
+
+  /*
+   * @return character under the cursor (undefined when outside the text)
+   */
+  getChar() {
+    return this.txt[this.iter];
+  }
+
+  /*
+   * @param start absolute start position
+   * @param end absolute end position (exclusive), defaults to the cursor
+   * @return substring of the text between start and end
+   */
+  slice(start, end) {
+    // only a missing end falls back to the cursor, an explicit 0 is honored
+    const stop = (end === undefined || end === null) ? this.iter : end;
+    return this.txt.slice(start, stop);
+  }
+
+  /*
+   * @return true if the text at the cursor starts with str
+   */
+  has(str) {
+    return this.txt.startsWith(str, this.iter);
+  }
+
+  /*
+   * advance the cursor by cnt characters
+   * @return true if the cursor still points inside the text afterwards
+   */
+  move(cnt) {
+    this.iter += cnt;
+    return (this.iter < this.txt.length);
+  }
+
+  skipSpaces(skipNewlines = false) {
+    for (;this.iter < this.txt.length; this.iter += 1) {
+      const chr = this.txt[this.iter];
+      if (chr !== ' ' && chr !== '\t' && (!skipNewlines || chr !== '\n')) {
        break;
      }
    }
-
-    return [mdArray, iter];
  }

-  static stoppingCondition(text: string, iter: number) {
-    const chr = text[iter];
-    if (chr === '\n'
-      || chr === '#'
-    ) {
-      return true;
-    }
-    return false;
+  countRepeatingCharacters() {
+    const chr = this.getChar();
+    let newIter = this.iter + 1;
+    for (;newIter < this.txt.length && this.txt[newIter] === chr;
+      newIter += 1
+    );
+    return newIter - this.iter;
  }

-  /*
-   * parses Articles (contains paragraphs, code-blocks, numeration, etc.)
-   * @param text string of text
-   * @param start number of position in text where to start
-   * @param headingLevel the number of heading headingLevels we are in
-   * @param indent ndentation that should be considered
-   * returns when encountering heading of <= headingLevel (iter is at # position)
-   *   or heading-cancel with three spaces (iter is past newlines)
-   *   or ident is smaller than given
-   */
-  parseSection(
-    text: string,
-    start: number,
-    headingLevel = 0,
-    indent = 0,
-  ) {
-    let iter = start;
-    const mdArray = [];
-    let pArray = [];
-    let paraStart = iter;
-    let lineNr = 0;
-
-    const  addParagraph = (start, end) => {
-      /*
-      let paraText = text.slice(start, end);
-      mdArray.push(['p', paraText]);
-      */
-      mdArray.push(['p', pArray]);
-      pArray = [];
+  moveToNextLine() {
+    const lineEnd = this.txt.indexOf('\n', this.iter);
+    if (lineEnd === -1) {
+      this.iter = this.txt.length;
+    } else {
+      this.iter = lineEnd + 1;
    }
-
-    while (true) {
-      if (iter >= text.length) {
-        if (paraStart < text.length) {
-          addParagraph(paraStart, text.length);
-        }
-        break;
-      }
-
-      const paraLineStart = iter;
-      lineNr += 1;
-
-      /*
-       * act on indent
-       */
-      let curIndent;
-      [curIndent, iter] = this.getIndent(text, iter);
-      if (curIndent < indent && lineNr > 1) {
-        if (paraLineStart - 1 > paraStart) {
-          addParagraph(paraStart, paraLineStart - 1);
-        }
-        iter = paraLineStart;
-        break;
-      }
-
-      const chr = text[iter];
-
-      /*
-       * unordered list
-       */
-      let isUnorderedList = false;
-      let isOrderedList = false;
-      if (chr === '-') {
-        isUnorderedList = true;
-        iter += 1;
-      }
-
-      /*
-       * ordered list
-       */
-      if (!Number.isNaN(parseInt(chr))) {
-        let itern = iter + 1;
-        for(;!Number.isNaN(parseInt(text[itern])); itern += 1){}
-        if (text[itern] === '.' || text[itern] === ')') {
-          isOrderedList = true;
-          iter = itern + 1;
-        }
-      }
-
-      if (isUnorderedList || isOrderedList) {
-        if (paraLineStart - 1 > paraStart) {
-          addParagraph(paraStart, paraLineStart - 1);
-        }
-        let childMdArray;
-        [childMdArray, iter] = this.parseSection(
-          text,
-          iter,
-          headingLevel,
-          curIndent + 1,
-        );
-        childMdArray = ['-', childMdArray];
-        // lists are encapsuled
-        const capsule = (isUnorderedList) ? 'ul' : 'ol';
-        if (!mdArray.length || mdArray[mdArray.length - 1][0] !== capsule) {
-          mdArray.push([capsule, [childMdArray]]);
-        }
-        else {
-          mdArray[mdArray.length - 1][1].push(childMdArray);
-        }
-        paraStart = iter;
-        continue;
-      }
-
-      /*
-       * quotes
-       */
-      if (chr === '>' || chr === '<') {
-        if (paraLineStart - 1 > paraStart) {
-          addParagraph(paraStart, paraLineStart - 1);
-        }
-        const [qArray, newIter] = this.parseQuote(text, iter);
-        mdArray.push(qArray);
-        iter = newIter;
-        paraStart = iter;
-        continue;
-      }
-
-      /*
-       * code block
-       */
-      if (text.startsWith('```', iter)) {
-        if (paraLineStart - 1 > paraStart) {
-          addParagraph(paraStart, paraLineStart - 1);
-        }
-        const [cbArray, newIter] = this.parseCodeBlock(text, iter + 3);
-        mdArray.push(cbArray);
-        iter = newIter;
-        paraStart = iter;
-        continue;
-      }
-
-      /* other stopping conditions */
-      if (!indent && MarkdownParser.stoppingCondition(text, iter)) {
-        // encountered something - save paragraph
-        if (paraLineStart - 1 > paraStart) {
-          addParagraph(paraStart, paraLineStart - 1);
-        }
-        const chr = text[iter];
-        if (chr === '\n') {
-          iter = this.skipSpaces(text, iter + 1);
-          if (text[iter] === '\n') {
-            if (headingLevel && this.newlineBreaksArticles) {
-              break;
-            }
-            iter += 1;
-          }
-        } else if (chr === '#') {
-          break;
-        }
-        paraStart = iter;
-        continue;
-      }
-      // rest of line
-      const [pPArray, newIter] = this.parseParagraph(text, iter);
-      if (pPArray) {
-        pArray = pArray.concat(pPArray);
-      }
-      iter = newIter;
-    }
-
-    return [mdArray, iter];
  }

-  /*
-   * go to character in line
-   * return position of character or null if not found before next line
-   */
-  goToCharInLine(text, start, chr) {
-    let iter = start;
-    for (;iter < text.length && text[iter] !== '\n' && text[iter] !== chr; iter += 1) {}
-    if (text[iter] === chr) {
-      return iter;
-    }
-    return null;
+  getLine() {
+    const startLine = this.iter;
+    this.moveToNextLine();
+    return this.txt.slice(startLine, this.iter);
  }

-  /*
-   * Parse Paragraph till next newline
-   */
-  parseParagraph(text, start) {
-    const pArray = [];
-    let iter = start;
-    let pStart = start;
-    let pEnd = 0;
-    for (;iter < text.length && text[iter] !== '\n'; iter += 1) {
-      let newElem = null;
-      if (text[iter] === '`') {
-        const pos = this.goToCharInLine(text, iter + 1, '`');
-        if (pos) {
-          newElem = ['c', text.slice(iter + 1, pos)];
-          pEnd = iter;
-          iter = pos;
-        }
-      }
-      /*
-      else if (text.startsWith('**', iter) {
-      }
-      */
-      if (pEnd) {
-        if (pStart !== pEnd) {
-          pArray.push(text.slice(pStart, pEnd));
-        }
-        pStart = iter + 1;
-        pEnd = 0;
-        pArray.push(newElem);
-      }
-    }
-    iter += 1;
-    if (pStart !== iter) {
-      pArray.push(text.slice(pStart, iter));
-    }
-    return [pArray, iter];
-  }
-
-  /*
-   * get indentation of line
-   * @param text
-   * @param start integer position of line start of indent to check
-   * @return integer of indentation
-   */
-  getIndent(text: string, start: number) {
-    let iter = start;
+  getIndent(tabWidth) {
    let indent = 0;
-    while (iter < text.length) {
-      const chr = text[iter];
+    while (this.iter < this.txt.length) {
+      const chr = this.getChar();
      if (chr === '\t') {
-        indent += this.tabWidth;
+        indent += tabWidth;
      } else if (chr === ' ') {
        indent += 1;
      } else {
        break;
      }
-      iter += 1;
+      this.iter += 1;
    }
-    return [indent, iter];
+    return indent;
  }

-  /*
-   * parse Code Block
-   * start is first character after the initializing ```
-   * we just parse till the ending occures
-   */
-  parseCodeBlock(text, start) {
-    let iter = this.skipSpaces(text, start, false);
-    if (text[iter] === '\n') {
-      iter += 1;
+  goToCharInLine(chr) {
+    let { iter } = this;
+    for (;
+      iter < this.txt.length && this.txt[iter] !== '\n'
+        && this.txt[iter] !== chr;
+      iter += 1
+    );
+    if (this.txt[iter] === chr) {
+      this.iter = iter;
+      return iter;
    }
-    const cbStart = iter;
-    while (true) {
-      if (iter >= text.length) {
-        return [['cb', text.slice(cbStart)], iter];
-      }
-      iter = this.skipSpaces(text, iter, true);
-      if (text.startsWith('```', iter)) {
-        const nextIter = iter + 3;
-        return [['cb', text.slice(cbStart, iter)], nextIter];
-      }
-      for (;iter < text.length && text[iter] !== '\n'; iter += 1) {}
-    }
-  }
-
-  /*
-   * parse quote
-   */
-  parseQuote(text, start) {
-    // either '<' or '>'
-    const quoteChar = text[start];
-    let iter = start;
-    let quoteText = '';
-    while(true) {
-      if (iter >= text.length || text[iter] !== quoteChar) {
-        break;
-      }
-      iter += 1;
-      const startLine = iter;
-      for (;iter < text.length && text[iter] !== '\n'; iter += 1) {}
-      iter += 1;
-      quoteText += text.slice(startLine, iter);
-    }
-    return [[quoteChar, this.parseText(quoteText, 0, 0)[0]], iter];
-  }
-
-  skipSpaces(text: string, start: number, skipNewlines = false) {
-    let iter = start;
-    for (;iter < text.length; iter += 1) {
-      const char = text[iter];
-      if (char !== ' ' && char !== '\t' && (!skipNewlines || char !== '\n')) {
-        break;
-      }
-    }
-    return iter;
+    return false;
  }
 }
+
+/*
+ * Parse Paragraph till next newline
+ * @param text MString, the cursor is moved past the consumed line
+ *   (including its newline) or to the end of the text
+ * @param opts parser options (currently not read here)
+ * @return array of plain strings and ['c', code] elements for inline code
+ *   that is opened and closed by backticks within the same line
+ */
+function parseMParagraph(text, opts) {
+  const pArray = [];
+  let pStart = text.iter;
+  while (!text.done()) {
+    const chr = text.getChar();
+    if (chr === '`') {
+      const oldPos = text.iter;
+      text.moveForward();
+      if (text.goToCharInLine('`') !== false) {
+        // cursor is on the closing backtick now
+        // flush the plain text in front of the code span; a span at the
+        // very start of the text (oldPos === 0) has nothing to flush but
+        // must still be emitted
+        if (pStart !== oldPos) {
+          pArray.push(text.slice(pStart, oldPos));
+        }
+        pArray.push(['c', text.slice(oldPos + 1)]);
+        pStart = text.iter + 1;
+      } else {
+        // no closing backtick in this line: treat it as ordinary text and
+        // rewind, so the character behind it (maybe the newline) is not
+        // skipped by the moveForward below
+        text.setIter(oldPos);
+      }
+    }
+    // bold ('**') and other inline markup are not parsed yet
+    text.moveForward();
+    if (chr === '\n') {
+      break;
+    }
+  }
+  if (pStart !== text.iter) {
+    pArray.push(text.slice(pStart));
+  }
+  return pArray;
+}
+
+/*
+ * parse Code Block
+ * the cursor has to be on the first character after the initializing ```
+ * we just parse till the ending occurs
+ * @param text MString, the cursor is moved past the closing ``` or, if
+ *   there is none, to the end of the text
+ * @return ['cb', code] with the text between the fences
+ */
+function parseCodeBlock(text) {
+  text.skipSpaces(false);
+  // getChar has to be called; comparing the method itself was always false
+  // and left the newline behind the opening fence inside the code
+  if (text.getChar() === '\n') {
+    text.moveForward();
+  }
+  const cbStart = text.iter;
+  while (!text.done()) {
+    // a closing fence may be indented, so look behind leading whitespace
+    text.skipSpaces(true);
+    if (text.has('```')) {
+      const elem = ['cb', text.slice(cbStart)];
+      text.move(3);
+      return elem;
+    }
+    text.moveToNextLine();
+  }
+  // unterminated block: everything up to the end of the text is code and
+  // there is no fence to step over
+  return ['cb', text.slice(cbStart)];
+}
+
+/*
+ * parse quote
+ * collects consecutive lines that start with the quote character the cursor
+ * is on, strips that character and parses the collected text on its own
+ * @param text MString, cursor on the quote character
+ * @param opts parser options, handed on to the nested parse
+ * @return [quoteChar, parsed content]
+ */
+function parseQuote(text, opts) {
+  // either '<' or '>'
+  const marker = text.getChar();
+  const lines = [];
+  for (;;) {
+    if (text.getChar() !== marker) {
+      break;
+    }
+    // step over the marker; nothing is left if it was the last character
+    if (!text.moveForward()) {
+      break;
+    }
+    lines.push(text.getLine());
+  }
+  const inner = new MString(lines.join(''));
+  return [marker, parseMText(inner, opts, 0)];
+}
+
+/*
+ * parses Section (contains paragraphs, lists, quotes and code blocks,
+ * but no headings)
+ * @param text MString, the cursor is moved past everything that got parsed
+ * @param opts parser options (tabWidth and newlineBreaksArticles are read)
+ * @param headingLevel the heading level we are in
+ * @param indent indentation that should be considered (when inside list)
+ * @return array of parsed elements
+ * returns at the end of the text
+ *   or when encountering a heading while indent is 0 (iter is at # position)
+ *   or when, inside a heading with newlineBreaksArticles set, an empty line
+ *     is followed by another newline (iter is at that second newline)
+ *   or when a line other than the first is indented less than indent
+ *     (iter is reset to the start of that line)
+ */
+function parseMSection(
+  text: MString,
+  opts: Object,
+  headingLevel,
+  indent,
+) {
+  const mdArray = [];
+  let pArray = [];
+  let lineNr = 0;
+
+  while (!text.done()) {
+    const paraLineStart = text.iter;
+    lineNr += 1;
+
+    // this also skips spaces
+    const curIndent = text.getIndent(opts.tabWidth);
+
+    /*
+     * act on indent
+     * rewind, so that the caller sees the less indented line from its start
+     */
+    if (curIndent < indent && lineNr > 1) {
+      text.setIter(paraLineStart);
+      break;
+    }
+
+    const chr = text.getChar();
+
+    /*
+     * break on heading
+     */
+    if (!indent && chr === '#') {
+      break;
+    }
+
+    /*
+     * is unordered list
+     */
+    let isUnorderedList = false;
+    let isOrderedList = false;
+    if (chr === '-') {
+      isUnorderedList = true;
+      text.moveForward();
+    }
+
+    /*
+     * is ordered list
+     * digits followed by '.' or ')'
+     */
+    if (!Number.isNaN(parseInt(chr, 10))) {
+      let itern = text.iter + 1;
+      for (;!Number.isNaN(parseInt(text.txt[itern], 10)); itern += 1);
+      const achr = text.txt[itern];
+      if (achr === '.' || achr === ')') {
+        isOrderedList = true;
+        text.setIter(itern + 1);
+      }
+    }
+
+    let pushPArray = false;
+    let insertElem = null;
+
+    if (isUnorderedList || isOrderedList) {
+      /*
+       * parse lists
+       */
+      if (pArray.length) {
+        mdArray.push(['p', pArray]);
+        pArray = [];
+      }
+      // the item content is everything indented deeper than its marker
+      const childMdArray = ['-', parseMSection(
+        text,
+        opts,
+        headingLevel,
+        curIndent + 1,
+      )];
+      // lists are encapsulated, adjacent items of the same kind share one
+      const capsule = (isUnorderedList) ? 'ul' : 'ol';
+      if (!mdArray.length || mdArray[mdArray.length - 1][0] !== capsule) {
+        mdArray.push([capsule, [childMdArray]]);
+      } else {
+        mdArray[mdArray.length - 1][1].push(childMdArray);
+      }
+    } else if (chr === '>' || chr === '<') {
+      /*
+       * quotes
+       */
+      pushPArray = true;
+      insertElem = parseQuote(text, opts);
+    } else if (text.has('```')) {
+      /*
+       * code block
+       */
+      pushPArray = true;
+      text.move(3);
+      insertElem = parseCodeBlock(text);
+    } else if (!indent && chr === '\n') {
+      /*
+       * break on multiple newlines
+       */
+      text.moveForward();
+      text.skipSpaces(false);
+      if (text.getChar() === '\n') {
+        if (headingLevel && opts.newlineBreaksArticles) {
+          break;
+        }
+        text.moveForward();
+      }
+      pushPArray = true;
+    } else {
+      /*
+       * ordinary text aka paragraph
+       * consecutive lines are collected into the same paragraph
+       */
+      const pPArray = parseMParagraph(text, opts);
+      if (pPArray) {
+        pArray = pArray.concat(pPArray);
+      }
+      continue;
+    }
+
+    // close the running paragraph before the element that interrupted it
+    if (pushPArray && pArray.length) {
+      mdArray.push(['p', pArray]);
+      pArray = [];
+    }
+
+    if (insertElem) {
+      mdArray.push(insertElem);
+    }
+  }
+
+  if (pArray.length) {
+    mdArray.push(['p', pArray]);
+  }
+
+  return mdArray;
+}
+
+/*
+ * parses text into sections and nested articles
+ * @param text MString, the cursor is moved past everything that got parsed
+ * @param opts parser options
+ * @param headingLevel heading level of the article we are in (0 for none)
+ * @return array of parsed elements, child articles are
+ *   ['a', level, title, content]
+ * returns at the end of the text
+ *   or at a heading of the same or a lower level (iter is at # position)
+ *   or when the section ended for any reason other than a heading
+ */
+parseMText = (text, opts, headingLevel) => {
+  let mdArray = [];
+  while (!text.done()) {
+    mdArray = mdArray.concat(parseMSection(text, opts, headingLevel, 0));
+    // either heading hit or article end
+    if (text.getChar() !== '#') {
+      break;
+    }
+    let subLvl = text.countRepeatingCharacters();
+    if (subLvl <= headingLevel || headingLevel === 6) {
+      // end of article
+      // encountered title with same headingLevel or lower
+      break;
+    }
+    // child article
+    text.move(subLvl);
+    const line = text.getLine();
+    // getLine keeps the newline and the spaces behind the #'s, the title
+    // shall contain neither
+    const title = (line.endsWith('\n') ? line.slice(0, -1) : line).trimLeft();
+    subLvl = Math.min(subLvl, 6);
+    const subMdArray = parseMText(text, opts, subLvl);
+    mdArray.push(['a', subLvl, title, subMdArray]);
+  }
+
+  return mdArray;
+};
+
+/*
+ * fill in defaults for parser options
+ * @param inOpts optional object with any of
+ *   parseLinks (default false),
+ *   tabWidth (default 4),
+ *   newlineBreaksArticles (default true)
+ * @return new options object with all three properties set
+ */
+function parseOpts(inOpts) {
+  const given = inOpts || {};
+  const opts = {};
+  opts.parseLinks = given.parseLinks || false;
+  opts.tabWidth = given.tabWidth || 4;
+  // `|| true` would turn an explicit false back into true, so only fall
+  // back to the default when the option is missing
+  const breaks = given.newlineBreaksArticles;
+  opts.newlineBreaksArticles = (breaks === undefined || breaks === null)
+    ? true : breaks;
+  return opts;
+}
+
+/*
+ * parse a single line of text into inline elements
+ * @param text string to parse
+ * @param inOpts optional parser options
+ * @return array of inline elements of the first line
+ */
+export function parseParagraph(text: string, inOpts) {
+  const options = parseOpts(inOpts);
+  return parseMParagraph(new MString(text), options);
+}
+
+/*
+ * parse a whole markdown text
+ * @param text string to parse
+ * @param inOpts optional parser options
+ * @return array of parsed elements
+ */
+export function parse(text: string, inOpts) {
+  const options = parseOpts(inOpts);
+  return parseMText(new MString(text), options, 0);
+}
--- a/utils/markdown-test/mdtest.js
+++ b/utils/markdown-test/mdtest.js
@ -2,16 +2,15 @@ import React, { useState } from 'react';
 import ReactDOM from 'react-dom';
 import { Parser } from 'commonmark';

-import MarkdownParser from '../../src/core/MarkdownParser';
+import { parse } from '../../src/core/MarkdownParser';

 import Markdown from './Markdown';

 const reader = new Parser({ smart: true });
-const a = new MarkdownParser();

 function parseText(text, setDuration, setCmDuration, setMd) {
  let startt = Date.now();
-  const arr = a.parse(text);
+  const arr = parse(text);
  setDuration(Date.now() - startt);
  startt = Date.now();
  reader.parse(text);