116 lines
3.1 KiB
JavaScript
116 lines
3.1 KiB
JavaScript
import { unescape as ldUnescape } from 'lodash'
|
|
|
|
import { getTagName } from './utility.service.js'
|
|
|
|
/**
 * This is a not-so-tiny purpose-built HTML parser/processor. It parses an HTML
 * string and converts it into a tree structure representing tag
 * openers/closers and children.
 *
 * Structure follows this pattern: [opener, [...children], closer], except for
 * the root node, which is just [...children]. Text nodes can only appear within
 * children and are represented as strings. Other node shapes produced:
 *  - self-closing/void elements are single-item arrays: [tag]
 *  - elements that are never closed have no closer: [opener, [...children]]
 *  - closing tags that do not match the currently open element are kept as
 *    plain strings inside the current children list
 *
 * The opener string is passed through lodash `unescape`; self-closing tags and
 * closers are stored verbatim.
 *
 * Intended use is to convert the HTML structure and then recursively iterate
 * over it, most likely using a map. Very useful for dynamically rendering HTML,
 * replacing tags with JSX elements in a render function.
 *
 * Tag names are compared case-insensitively, both for detecting void elements
 * and for matching a closer with its opener.
 *
 * known issue: doesn't handle CDATA so CDATA might not work well
 * known issue: doesn't handle HTML comments
 * known issue: any '>' ends the tag being read, including one inside an
 *              attribute value
 *
 * @param {string} [html=''] - HTML source to parse
 * @return {Array} root children list: strings (text) and nested node arrays
 */
export const convertHtmlToTree = (html = '') => {
  // Elements that are implicitly self-closing
  // https://developer.mozilla.org/en-US/docs/Glossary/empty_element
  const emptyElements = new Set([
    'area',
    'base',
    'br',
    'col',
    'embed',
    'hr',
    'img',
    'input',
    'keygen',
    'link',
    'meta',
    'param',
    'source',
    'track',
    'wbr',
  ])
  // TODO For future - also parse HTML5 multi-source components?

  const buffer = [] // Root output buffer (the returned tree)
  // Stack of currently open elements; the first entry is the root sentinel
  // and must never be popped
  const levels = [['', buffer]]
  let textBuffer = '' // Text collected since the last tag
  let tagBuffer = null // Tag currently being read, null = not inside a tag

  // Children list of the innermost open element
  const getCurrentBuffer = () => levels[levels.length - 1][1]

  // Tag name lowercased for comparison; non-string results of getTagName
  // (e.g. "no name found") are passed through untouched
  const getNormalizedTagName = (tag) => {
    const name = getTagName(tag)
    return typeof name === 'string' ? name.toLowerCase() : name
  }

  // Moves pending text (if any) into the current children list
  const flushText = () => {
    if (textBuffer === '') return
    getCurrentBuffer().push(textBuffer)
    textBuffer = ''
  }

  const handleSelfClosing = (tag) => {
    getCurrentBuffer().push([tag])
  }

  // Starts a new nesting level and attaches it to its parent right away, so
  // elements that never get closed still end up in the tree
  const handleOpen = (tag) => {
    const parentBuffer = getCurrentBuffer()
    const newLevel = [ldUnescape(tag), []]
    levels.push(newLevel)
    parentBuffer.push(newLevel)
  }

  const handleClose = (tag) => {
    const currentLevel = levels[levels.length - 1]
    // The root sentinel is not an element: a closer seen at root level can
    // never close it, even if neither side yields a tag name. Popping it
    // would lose the closer and break every later buffer access.
    const hasOpenElement = levels.length > 1
    if (
      hasOpenElement &&
      getNormalizedTagName(currentLevel[0]) === getNormalizedTagName(tag)
    ) {
      currentLevel.push(tag)
      levels.pop()
    } else {
      // Stray/mismatched closer: keep it as text in the current children
      getCurrentBuffer().push(tag)
    }
  }

  for (let i = 0; i < html.length; i++) {
    const char = html[i]
    if (char === '<' && tagBuffer === null) {
      // Tag starts: everything collected so far is a text node
      flushText()
      tagBuffer = char
    } else if (char !== '>' && tagBuffer !== null) {
      tagBuffer += char
    } else if (char === '>' && tagBuffer !== null) {
      tagBuffer += char
      const tagFull = tagBuffer
      tagBuffer = null
      if (tagFull[1] === '/') {
        handleClose(tagFull)
      } else if (
        emptyElements.has(getNormalizedTagName(tagFull)) ||
        tagFull[tagFull.length - 2] === '/'
      ) {
        // self-closing: void element or explicit "/>"
        handleSelfClosing(tagFull)
      } else {
        handleOpen(tagFull)
      }
    } else {
      textBuffer += char
    }
  }

  // Input ended in the middle of a tag: keep what was read as text
  if (tagBuffer) {
    textBuffer += tagBuffer
  }

  flushText()
  return buffer
}
|