refactor MarkdownParser

2021-11-27 12:54:06 +01:00 · 2021-11-27 12:54:06 +01:00 · 134d666480
commit 134d666480
parent 87e053f99d
2 changed files with 358 additions and 331 deletions
--- a/src/core/MarkdownParser.js
+++ b/src/core/MarkdownParser.js
@ -9,361 +9,389 @@
 * @flow
 */
-export default class MarkdownParser {
+let parseMText = () => {};
  static nonWithespace = /\S|$/;
  static expArticle = /(?:\n|^)\s*?(?=\#)|(?<=(?:\n|^)\s*?(?=\#).*?)\n/;
-  constructor(opt) {
+class MString {
-    this.parseLinks = opt && opt.parseLinks || false;
+  constructor(text, start) {
-    this.tabWidth = opt && opt.tabWidth || 4;
+    this.txt = text;
-    this.newlineBreaksArticles = opt && opt.newlineBreaksArticles || true;
+    this.iter = start || 0;
  }
-  parse(text: string) {
+  nextChar() {
-    return this.parseText(text, 0, 0)[0];
+    this.iter += 1;
    return this.txt[this.iter];
  }
-  parseText(text, headingLevel, start) {
+  done() {
-    let mdArray = [];
+    return (this.iter >= this.txt.length);
-    let iter = start;
+  }
-    while (iter < text.length) {
+
-      const [aMdArray, newIter] = this.parseSection(
+  moveForward() {
-        text, iter, headingLevel,
+    this.iter += 1;
-      );
+    return (this.iter < this.txt.length);
-      iter = newIter;
+  }
-      mdArray = mdArray.concat(aMdArray);
+
-      // either heading hit or article end
+  setIter(iter) {
-      const chr = text[iter];
+    this.iter = iter;
-      if (chr === '#') {
+  }
-        let subLvl = 0;
+
-        for (;
+  getChar() {
-          iter + subLvl <= text.length && text[iter + subLvl] === '#';
+    return this.txt[this.iter];
-          subLvl += 1
+  }
-        ) {}
+
-        if (subLvl <= headingLevel || headingLevel === 6) {
+  slice(start, end) {
-          // end of article
+    return this.txt.slice(start, end || this.iter);
-          // encountered title with same headingLevel or lower
+  }
-          break;
+
-        } else {
+  has(str) {
-          // child article
+    return this.txt.startsWith(str, this.iter);
-          let lineEnd = text.indexOf('\n', iter);
+  }
-          if (lineEnd === -1) lineEnd = text.length;
+
-          const title = text.slice(iter + subLvl, lineEnd).trimLeft();
+  move(cnt) {
-          subLvl = Math.min(subLvl, 6);
+    this.iter += cnt;
-          const [subMdArray, newIter] = this.parseText(
+    return (this.iter < this.txt.length);
-            text, subLvl, lineEnd + 1,
+  }
-          );
+
-          mdArray.push(['a', subLvl, title, subMdArray]);
+  skipSpaces(skipNewlines = false) {
-          iter = newIter;
+    for (;this.iter < this.txt.length; this.iter += 1) {
-        }
+      const chr = this.txt[this.iter];
-      } else {
+      if (chr !== ' ' && chr !== '\t' && (!skipNewlines || chr !== '\n')) {
        break;
      }
    }
    return [mdArray, iter];
  }
-  static stoppingCondition(text: string, iter: number) {
+  countRepeatingCharacters() {
-    const chr = text[iter];
+    const chr = this.getChar();
-    if (chr === '\n'
+    let newIter = this.iter + 1;
-      || chr === '#'
+    for (;newIter < this.txt.length && this.txt[newIter] === chr;
-    ) {
+      newIter += 1
-      return true;
+    );
-    }
+    return newIter - this.iter;
    return false;
  }
-  /*
+  moveToNextLine() {
-   * parses Articles (contains paragraphs, code-blocks, numeration, etc.)
+    const lineEnd = this.txt.indexOf('\n', this.iter);
-   * @param text string of text
+    if (lineEnd === -1) {
-   * @param start number of position in text where to start
+      this.iter = this.txt.length;
-   * @param headingLevel the number of heading headingLevels we are in
+    } else {
-   * @param indent ndentation that should be considered
+      this.iter = lineEnd + 1;
   * returns when encountering heading of <= headingLevel (iter is at # position)
   *   or heading-cancel with three spaces (iter is past newlines)
   *   or ident is smaller than given
   */
  parseSection(
    text: string,
    start: number,
    headingLevel = 0,
    indent = 0,
  ) {
    let iter = start;
    const mdArray = [];
    let pArray = [];
    let paraStart = iter;
    let lineNr = 0;
    const  addParagraph = (start, end) => {
      /*
      let paraText = text.slice(start, end);
      mdArray.push(['p', paraText]);
      */
      mdArray.push(['p', pArray]);
      pArray = [];
    }
    while (true) {
      if (iter >= text.length) {
        if (paraStart < text.length) {
          addParagraph(paraStart, text.length);
        }
        break;
      }
      const paraLineStart = iter;
      lineNr += 1;
      /*
       * act on indent
       */
      let curIndent;
      [curIndent, iter] = this.getIndent(text, iter);
      if (curIndent < indent && lineNr > 1) {
        if (paraLineStart - 1 > paraStart) {
          addParagraph(paraStart, paraLineStart - 1);
        }
        iter = paraLineStart;
        break;
      }
      const chr = text[iter];
      /*
       * unordered list
       */
      let isUnorderedList = false;
      let isOrderedList = false;
      if (chr === '-') {
        isUnorderedList = true;
        iter += 1;
      }
      /*
       * ordered list
       */
      if (!Number.isNaN(parseInt(chr))) {
        let itern = iter + 1;
        for(;!Number.isNaN(parseInt(text[itern])); itern += 1){}
        if (text[itern] === '.' || text[itern] === ')') {
          isOrderedList = true;
          iter = itern + 1;
        }
      }
      if (isUnorderedList || isOrderedList) {
        if (paraLineStart - 1 > paraStart) {
          addParagraph(paraStart, paraLineStart - 1);
        }
        let childMdArray;
        [childMdArray, iter] = this.parseSection(
          text,
          iter,
          headingLevel,
          curIndent + 1,
        );
        childMdArray = ['-', childMdArray];
        // lists are encapsuled
        const capsule = (isUnorderedList) ? 'ul' : 'ol';
        if (!mdArray.length || mdArray[mdArray.length - 1][0] !== capsule) {
          mdArray.push([capsule, [childMdArray]]);
        }
        else {
          mdArray[mdArray.length - 1][1].push(childMdArray);
        }
        paraStart = iter;
        continue;
      }
      /*
       * quotes
       */
      if (chr === '>' || chr === '<') {
        if (paraLineStart - 1 > paraStart) {
          addParagraph(paraStart, paraLineStart - 1);
        }
        const [qArray, newIter] = this.parseQuote(text, iter);
        mdArray.push(qArray);
        iter = newIter;
        paraStart = iter;
        continue;
      }
      /*
       * code block
       */
      if (text.startsWith('```', iter)) {
        if (paraLineStart - 1 > paraStart) {
          addParagraph(paraStart, paraLineStart - 1);
        }
        const [cbArray, newIter] = this.parseCodeBlock(text, iter + 3);
        mdArray.push(cbArray);
        iter = newIter;
        paraStart = iter;
        continue;
      }
      /* other stopping conditions */
      if (!indent && MarkdownParser.stoppingCondition(text, iter)) {
        // encountered something - save paragraph
        if (paraLineStart - 1 > paraStart) {
          addParagraph(paraStart, paraLineStart - 1);
        }
        const chr = text[iter];
        if (chr === '\n') {
          iter = this.skipSpaces(text, iter + 1);
          if (text[iter] === '\n') {
            if (headingLevel && this.newlineBreaksArticles) {
              break;
            }
            iter += 1;
          }
        } else if (chr === '#') {
          break;
        }
        paraStart = iter;
        continue;
      }
      // rest of line
      const [pPArray, newIter] = this.parseParagraph(text, iter);
      if (pPArray) {
        pArray = pArray.concat(pPArray);
      }
      iter = newIter;
    }
    return [mdArray, iter];
  }
-  /*
+  getLine() {
-   * go to character in line
+    const startLine = this.iter;
-   * return position of character or null if not found before next line
+    this.moveToNextLine();
-   */
+    return this.txt.slice(startLine, this.iter);
  goToCharInLine(text, start, chr) {
    let iter = start;
    for (;iter < text.length && text[iter] !== '\n' && text[iter] !== chr; iter += 1) {}
    if (text[iter] === chr) {
      return iter;
    }
    return null;
  }
-  /*
+  getIndent(tabWidth) {
   * Parse Paragraph till next newline
   */
  parseParagraph(text, start) {
    const pArray = [];
    let iter = start;
    let pStart = start;
    let pEnd = 0;
    for (;iter < text.length && text[iter] !== '\n'; iter += 1) {
      let newElem = null;
      if (text[iter] === '`') {
        const pos = this.goToCharInLine(text, iter + 1, '`');
        if (pos) {
          newElem = ['c', text.slice(iter + 1, pos)];
          pEnd = iter;
          iter = pos;
        }
      }
      /*
      else if (text.startsWith('**', iter) {
      }
      */
      if (pEnd) {
        if (pStart !== pEnd) {
          pArray.push(text.slice(pStart, pEnd));
        }
        pStart = iter + 1;
        pEnd = 0;
        pArray.push(newElem);
      }
    }
    iter += 1;
    if (pStart !== iter) {
      pArray.push(text.slice(pStart, iter));
    }
    return [pArray, iter];
  }
  /*
   * get indentation of line
   * @param text
   * @param start integer position of line start of indent to check
   * @return integer of indentation
   */
  getIndent(text: string, start: number) {
    let iter = start;
    let indent = 0;
-    while (iter < text.length) {
+    while (this.iter < this.txt.length) {
-      const chr = text[iter];
+      const chr = this.getChar();
      if (chr === '\t') {
-        indent += this.tabWidth;
+        indent += tabWidth;
      } else if (chr === ' ') {
        indent += 1;
      } else {
        break;
      }
-      iter += 1;
+      this.iter += 1;
    }
-    return [indent, iter];
+    return indent;
  }
-  /*
+  goToCharInLine(chr) {
-   * parse Code Block
+    let { iter } = this;
-   * start is first character after the initializing ```
+    for (;
-   * we just parse till the ending occures
+      iter < this.txt.length && this.txt[iter] !== '\n'
-   */
+        && this.txt[iter] !== chr;
-  parseCodeBlock(text, start) {
+      iter += 1
-    let iter = this.skipSpaces(text, start, false);
+    );
-    if (text[iter] === '\n') {
+    if (this.txt[iter] === chr) {
-      iter += 1;
+      this.iter = iter;
      return iter;
    }
-    const cbStart = iter;
+    return false;
    while (true) {
      if (iter >= text.length) {
        return [['cb', text.slice(cbStart)], iter];
      }
      iter = this.skipSpaces(text, iter, true);
      if (text.startsWith('```', iter)) {
        const nextIter = iter + 3;
        return [['cb', text.slice(cbStart, iter)], nextIter];
      }
      for (;iter < text.length && text[iter] !== '\n'; iter += 1) {}
    }
  }
  /*
   * parse quote
   */
  parseQuote(text, start) {
    // either '<' or '>'
    const quoteChar = text[start];
    let iter = start;
    let quoteText = '';
    while(true) {
      if (iter >= text.length || text[iter] !== quoteChar) {
        break;
      }
      iter += 1;
      const startLine = iter;
      for (;iter < text.length && text[iter] !== '\n'; iter += 1) {}
      iter += 1;
      quoteText += text.slice(startLine, iter);
    }
    return [[quoteChar, this.parseText(quoteText, 0, 0)[0]], iter];
  }
  skipSpaces(text: string, start: number, skipNewlines = false) {
    let iter = start;
    for (;iter < text.length; iter += 1) {
      const char = text[iter];
      if (char !== ' ' && char !== '\t' && (!skipNewlines || char !== '\n')) {
        break;
      }
    }
    return iter;
  }
 }
 /*
 * Parse Paragraph till next newline
 */
 function parseMParagraph(text, opts) {
  const pArray = [];
  let pStart = text.iter;
  let pEnd = 0;
  while (!text.done()) {
    const chr = text.getChar();
    let newElem = null;
    if (chr === '`') {
      const oldPos = text.iter;
      text.moveForward();
      if (text.goToCharInLine('`')) {
        newElem = ['c', text.slice(oldPos + 1)];
        pEnd = oldPos;
      }
    }
    /*
    else if (text.startsWith('**', iter) {
    }
    */
    if (pEnd) {
      if (pStart !== pEnd) {
        pArray.push(text.slice(pStart, pEnd));
      }
      pStart = text.iter + 1;
      pEnd = 0;
      pArray.push(newElem);
    }
    text.moveForward();
    if (chr === '\n') {
      break;
    }
  }
  if (pStart !== text.iter) {
    pArray.push(text.slice(pStart));
  }
  return pArray;
 }
 /*
 * parse Code Block
 * start is first character after the initializing ```
 * we just parse till the ending occures
 */
 function parseCodeBlock(text) {
  text.skipSpaces(false);
  if (text.getChar === '\n') {
    text.moveForward();
  }
  const cbStart = text.iter;
  while (!text.done()) {
    text.skipSpaces(true);
    if (text.has('```')) {
      const elem = ['cb', text.slice(cbStart)];
      text.move(3);
      return elem;
    }
    text.moveToNextLine();
  }
  const cbText = text.slice(cbStart);
  text.move(3);
  return ['cb', cbText];
 }
 /*
 * parse quote
 */
 function parseQuote(text, opts) {
  // either '<' or '>'
  const quoteChar = text.getChar();
  let quoteText = '';
  while (text.getChar() === quoteChar && text.moveForward()) {
    const line = text.getLine();
    quoteText += line;
  }
  const mQuoteText = new MString(quoteText);
  return [quoteChar, parseMText(mQuoteText, opts, 0)];
 }
 /*
 * parses Section (contains paragraphs, lists, etc. but no headings or quotes)
 * @param text MString
 * @param headingLevel the number of heading headingLevels we are in
 * @param indent ndentation that should be considered (when inside list)
 * returns when encountering heading of <= headingLevel (iter is at # position)
 *   or heading-cancel with three spaces (iter is past newlines)
 *   or ident is smaller than given
 */
 function parseMSection(
  text: string,
  opts: Object,
  headingLevel,
  indent,
 ) {
  const mdArray = [];
  let pArray = [];
  let lineNr = 0;
  while (!text.done()) {
    const paraLineStart = text.iter;
    lineNr += 1;
    // this also skips spaces
    const curIndent = text.getIndent(opts.tabWidth);
    /*
     * act on indent
     */
    if (curIndent < indent && lineNr > 1) {
      text.setIter(paraLineStart);
      break;
    }
    const chr = text.getChar();
    /*
     * break on heading
     */
    if (!indent && chr === '#') {
      break;
    }
    /*
     * is unordered list
     */
    let isUnorderedList = false;
    let isOrderedList = false;
    if (chr === '-') {
      isUnorderedList = true;
      text.moveForward();
    }
    /*
     * is ordered list
     */
    if (!Number.isNaN(parseInt(chr, 10))) {
      let itern = text.iter + 1;
      for (;!Number.isNaN(parseInt(text.txt[itern], 10)); itern += 1);
      const achr = text.txt[itern];
      if (achr === '.' || achr === ')') {
        isOrderedList = true;
        text.setIter(itern + 1);
      }
    }
    let pushPArray = false;
    let insertElem = null;
    if (isUnorderedList || isOrderedList) {
      /*
       * parse lists
       */
      if (pArray.length) {
        mdArray.push(['p', pArray]);
        pArray = [];
      }
      let childMdArray;
      childMdArray = parseMSection(
        text,
        opts,
        headingLevel,
        curIndent + 1,
      );
      childMdArray = ['-', childMdArray];
      // lists are encapsuled
      const capsule = (isUnorderedList) ? 'ul' : 'ol';
      if (!mdArray.length || mdArray[mdArray.length - 1][0] !== capsule) {
        mdArray.push([capsule, [childMdArray]]);
      } else {
        mdArray[mdArray.length - 1][1].push(childMdArray);
      }
    } else if (chr === '>' || chr === '<') {
      /*
       * quotes
       */
      pushPArray = true;
      insertElem = parseQuote(text, opts);
    } else if (text.has('```')) {
      /*
       * code block
       */
      pushPArray = true;
      text.move(3);
      insertElem = parseCodeBlock(text);
    } else if (!indent && chr === '\n') {
      /*
       * break on multiple newlines
       */
      text.moveForward();
      text.skipSpaces(false);
      if (text.getChar() === '\n') {
        if (headingLevel && opts.newlineBreaksArticles) {
          break;
        }
        text.moveForward();
      }
      pushPArray = true;
    } else {
      /*
       * ordinary text aka paragraph
       */
      const pPArray = parseMParagraph(text, opts);
      if (pPArray) {
        pArray = pArray.concat(pPArray);
      }
      continue;
    }
    if (pushPArray && pArray.length) {
      mdArray.push(['p', pArray]);
      pArray = [];
    }
    if (insertElem) {
      mdArray.push(insertElem);
    }
  }
  if (pArray.length) {
    mdArray.push(['p', pArray]);
  }
  return mdArray;
 }
 parseMText = (text, opts, headingLevel) => {
  let mdArray = [];
  while (!text.done()) {
    const aMdArray = parseMSection(
      text, opts, headingLevel, 0,
    );
    mdArray = mdArray.concat(aMdArray);
    // either heading hit or article end
    const chr = text.getChar();
    if (chr === '#') {
      let subLvl = text.countRepeatingCharacters();
      if (subLvl <= headingLevel || headingLevel === 6) {
        // end of article
        // encountered title with same headingLevel or lower
        break;
      } else {
        // child article
        text.move(subLvl);
        const title = text.getLine();
        subLvl = Math.min(subLvl, 6);
        const subMdArray = parseMText(
          text, opts, subLvl,
        );
        mdArray.push(['a', subLvl, title, subMdArray]);
      }
    } else {
      break;
    }
  }
  return mdArray;
 };
 function parseOpts(inOpts) {
  const opts = {};
  opts.parseLinks = (inOpts && inOpts.parseLinks) || false;
  opts.tabWidth = (inOpts && inOpts.tabWidth) || 4;
  opts.newlineBreaksArticles = (inOpts && inOpts.newlineBreaksArticles) || true;
  return opts;
 }
 export function parseParagraph(text: string, inOpts) {
  const opts = parseOpts(inOpts);
  const mText = new MString(text);
  return parseMParagraph(mText, opts);
 }
 export function parse(text: string, inOpts) {
  const opts = parseOpts(inOpts);
  const mText = new MString(text);
  return parseMText(mText, opts, 0);
 }
--- a/utils/markdown-test/mdtest.js
+++ b/utils/markdown-test/mdtest.js
@ -2,16 +2,15 @@ import React, { useState } from 'react';
 import ReactDOM from 'react-dom';
 import { Parser } from 'commonmark';
-import MarkdownParser from '../../src/core/MarkdownParser';
+import { parse } from '../../src/core/MarkdownParser';
 import Markdown from './Markdown';
 const reader = new Parser({ smart: true });
 const a = new MarkdownParser();
 function parseText(text, setDuration, setCmDuration, setMd) {
  let startt = Date.now();
-  const arr = a.parse(text);
+  const arr = parse(text);
  setDuration(Date.now() - startt);
  startt = Date.now();
  reader.parse(text);