wp2md-python/main.py

#!/usr/bin/env python3
# encoding: utf-8

from bs4 import BeautifulSoup
from slugify import slugify
from markdownify import MarkdownConverter
import os
import re
import json

in_file = 'wpexport20241219.xml'


class IframeConverter(MarkdownConverter):
	"""Markdown converter that renders ``<iframe>`` elements as ``:::`` container blocks."""

	def convert_iframe(self, el, text, convert_as_inline):
		"""Return the container-block markup for one ``<iframe>`` element.

		Args:
			el: The iframe element; only its ``src`` attribute is read.
			text: Already-converted inner text (unused).
			convert_as_inline: Inline-context flag passed by the converter (unused).

		Returns:
			A ``::: youtube <code>`` or ``::: steam <code>`` block when the
			corresponding identifier can be extracted from ``src``, otherwise a
			generic ``::: iframe <src>`` block. An iframe without a ``src``
			produces an empty string.
		"""
		src = el.get('src')
		if not src:
			# Nothing to embed; previously this raised KeyError.
			return ''

		if 'youtube' in src:
			found = re.search(r'embed/([^?]+)', src)
			if found:
				return f"::: youtube {found.group(1)}\n:::\n"
		elif 'steam' in src:
			found = re.search(r'/([0-9]+)', src)
			if found:
				return f"::: steam {found.group(1)}\n:::\n"

		# Unknown provider, or a known provider whose URL did not have the
		# expected shape (which used to raise AttributeError on None).
		return f"::: iframe {src}\n:::\n"


def mkdirp(path):
	"""Create directory ``path`` and any missing parents; do nothing if it already exists.

	Uses ``exist_ok=True`` rather than a separate existence check, so there
	is no window between checking and creating.

	Raises:
		FileExistsError: If ``path`` exists but is not a directory.
	"""
	os.makedirs(path, exist_ok=True)


def md(html, **options):
	"""Convert an HTML string to Markdown with an ``IframeConverter``.

	All keyword ``options`` are forwarded unchanged to the converter's constructor.
	"""
	converter = IframeConverter(**options)
	return converter.convert(html)


def code_parser(el):
	"""Return the fence language for a code element.

	Reads the element's ``data-language`` attribute: ``'jscript'`` is mapped
	to ``'js'``, any other value is returned as-is, and ``'text'`` is used
	when the attribute is absent.
	"""
	if el.has_attr('data-language'):
		declared = el['data-language']
		return 'js' if declared == 'jscript' else declared

	return 'text'


def find_all_matches(pattern, string, group=0):
	"""Collect ``group`` from every match of ``pattern`` in ``string``, overlapping ones included.

	After each match the scan resumes one character past the match's start
	(not its end), so matches that overlap an earlier one are also reported.

	Args:
		pattern: A regex source string or an already compiled pattern.
		string: The text to scan.
		group: Index or name of the group to collect from each match.

	Returns:
		A list of the selected group values, in order of match start.
	"""
	pat = re.compile(pattern)
	limit = len(string)
	pos = 0
	out = []
	# ``search`` clamps a start position beyond the end back to ``len(string)``,
	# so without the bound a pattern that can match the empty string at the end
	# of the input would be found again and again forever.
	while pos <= limit and (m := pat.search(string, pos)):
		out.append(m[group])
		pos = m.start() + 1
	return out

def fix_math_content(content):
	"""Remove the backslash escape in front of underscores and asterisks."""
	# Underscores are unescaped first, then asterisks.
	for escaped, plain in ((r'\_', '_'), (r'\*', '*')):
		content = content.replace(escaped, plain)
	return content


def fix_math(content):
	"""Undo underscore/asterisk escaping inside math spans.

	A span is an opening delimiter, a lazily matched body and a closing
	delimiter as defined by the pattern below. Note that for dollar-sign
	openers the first (non-digit) character after the dollars belongs to the
	opener group and is therefore left untouched. Only the body is passed
	through ``fix_math_content``; delimiters and surrounding text are kept
	as they are.
	"""
	math_regex = re.compile(r'(\\[\[\(]|\${1,2}[^0-9])(.+?)(\${1,2}|\\[$\)\]])')

	def unescape_body(found):
		opener, body, closer = found.groups()
		return opener + fix_math_content(body) + closer

	return math_regex.sub(unescape_body, content)


# Shortcode names that are rendered as a fenced code block in that language.
_CODE_SHORTCODES = frozenset({
	'c', 'cpp', 'python', 'csharp', 'java', 'javascript', 'typescript',
	'bash', 'html', 'css', 'php', 'sql', 'json', 'xml', 'yaml', 'markdown',
	'text', 'plaintext',
})


def fix_short_code_content(tag, params, content):
	"""Render one shortcode as Markdown or a custom element.

	Args:
		tag: Shortcode name (the text right after the opening bracket).
		params: Raw attribute text following the name; only used for the
			``tr``, ``cl``, ``btn`` and ``ctl`` shortcodes.
		content: Text between the opening and closing shortcode tags.

	Returns:
		The replacement text, or ``None`` when ``tag`` is not a known
		shortcode so that the caller can leave the original untouched.
	"""
	if tag in _CODE_SHORTCODES:
		return f"```{tag}\n{content}\n```"
	if tag == 'latex':
		# Backslashes are doubled on purpose: a bare backslash before a
		# parenthesis is an invalid escape sequence in a normal string literal.
		return f"\\({content}\\)"
	if tag == 'graphviz':
		return f"::: graphviz\n{content}\n:::\n"
	if tag in ('abcjs', 'abc'):
		return f"::: abc\n{content}\n:::\n"
	if tag in ('kbd', 'key'):
		return f"<k content=\"{content}\">"
	if tag in ('tr', 'cl'):
		# ``tr`` is emitted as a <tx> element, ``cl`` keeps its own name.
		element = 'tx' if tag == 'tr' else 'cl'
		attrs = params.strip()
		if attrs == '':
			return f"<{element} content=\"{content}\">"
		return f"<{element} {attrs} content=\"{content}\">"
	if tag in ('btn', 'ctl'):
		return f"<btn {params.strip()} content=\"{content}\">"
	return None


def fix_short_code(content):
	"""Replace every recognised shortcode pair in ``content``.

	A shortcode is an opening bracketed name with optional attribute text,
	a non-empty body (which may span lines) and a closing tag with the same
	name. Each one is handed to ``fix_short_code_content``; when that returns
	``None`` the shortcode is left exactly as it was.
	"""
	shortcode_regex = re.compile(r'\[([a-zA-Z0-9]+)([^\]]*)\](.+?)\[\/\1\]', re.DOTALL)

	def render(found):
		converted = fix_short_code_content(found.group(1), found.group(2), found.group(3))
		return found.group(0) if converted is None else converted

	return shortcode_regex.sub(render, content)


def fix_nesting_code_blocks(content):
	"""Drop a fence line that is directly followed by another fence line.

	The text is split into lines; every line starting with three backticks
	whose successor also starts with three backticks is removed. The
	remaining lines are joined with newlines (a trailing newline is not kept).
	"""
	fence = '```'
	rows = content.splitlines()
	# Pair each line with its successor; the last line gets an empty successor.
	followers = rows[1:] + ['']
	kept = [
		row
		for row, following in zip(rows, followers)
		if not (row.startswith(fence) and following.startswith(fence))
	]
	return '\n'.join(kept)


def fix_code_block_content(content):
	"""Strip the backslash escape that precedes an underscore or an asterisk."""
	# One pass: a backslash followed by '_' or '*' is replaced by that character.
	return re.sub(r'\\([_*])', r'\1', content)


def trim_code_blocks(content):
	"""Normalise whitespace in and around triple-backtick spans.

	Text between spans is stripped. Each span is re-emitted as a fenced block
	surrounded by blank lines: the text up to the first newline, carriage
	return or space is used as the info string, the rest is the body. The body
	is passed through ``fix_code_block_content`` and stripped.

	A span without any whitespace (for example an inline triple-backtick
	span) has no info string; its whole text is treated as the body instead
	of raising ``IndexError``.
	"""
	codeblock_regex = re.compile(r'```(.+?)```', re.DOTALL)
	pieces = []
	pos = 0
	for m in codeblock_regex.finditer(content):
		pieces.append(content[pos:m.start()].strip())
		# ``maxsplit`` must be given by keyword; positional use is deprecated.
		parts = re.split(r'[\n\r ]', m.group(1), maxsplit=1)
		if len(parts) == 2:
			info, body = parts
		else:
			info, body = '', parts[0]
		pieces.append(
			'\n\n```' + info.strip() + '\n'
			+ fix_code_block_content(body).strip() + '\n```\n\n'
		)
		pos = m.end()
	pieces.append(content[pos:].strip())
	return ''.join(pieces)


# Make sure the root output directory exists before anything is written.
mkdirp('output')

# Read the whole export file into memory as UTF-8 text.
with open(in_file, 'r', encoding='utf-8') as f:
	raw_content = f.read()

# Parse the export with BeautifulSoup's XML parser.
soup = BeautifulSoup(raw_content, 'xml')

# Every <item> element is a candidate for conversion; report how many there are.
items = soup.find_all('item')
print(f"{len(items)} items found")

# Matches one [ref]...[/ref] footnote shortcode; compiled once instead of per item.
ref_regex = re.compile(r'\[ref](.+?)\[/ref]')

# Convert every post/page item into a Markdown file with YAML front matter,
# written to output/<year>/<month>/<slug>.md.
for item in items:
	post_type_raw = item.find('wp:post_type', recursive=False)
	if post_type_raw is None:
		continue

	# Only items whose post type is 'page' or 'post' are exported.
	post_type = post_type_raw.contents[0]
	if post_type not in ('page', 'post'):
		continue

	post_id = item.find('wp:post_id', recursive=False).contents[0]
	# An empty <title> has no children; use '' instead of raising IndexError
	# (the slug below then falls back to 'untitled').
	title_node = item.find('title', recursive=False)
	title = title_node.contents[0] if title_node is not None and title_node.contents else ''
	creator = item.find('dc:creator', recursive=False).contents[0]
	date = item.find('wp:post_date', recursive=False).contents[0]
	status = item.find('wp:status', recursive=False).contents[0]
	# Tags and categories become YAML list entries, one ' - name' per line.
	tags = '\n'.join(
		' - ' + x.contents[0]
		for x in item.find_all('category', {'domain': 'post_tag'}))
	categories = '\n'.join(
		' - ' + x.contents[0]
		for x in item.find_all('category', {'domain': 'category'}))
	# Same guard as for the title: an item may have an empty body.
	content_node = item.find('content:encoded', recursive=False)
	content = content_node.contents[0] if content_node is not None and content_node.contents else ''

	# Remove block-editor comments except the syntaxhighlighter ones, which are
	# needed below. The quantifier is lazy so that two comments on the same
	# line are removed separately and the text between them is kept.
	content = re.sub(r'<!-- /?wp:((?!syntaxhighlighter).)*?\s*-->', '', content)
	content = re.sub(r'<!--more-->', '<p class="more">:::more:::</p>', content)

	# Copy the JSON attributes of each syntaxhighlighter comment onto the line
	# that follows it as data-* attributes, so they survive the HTML conversion.
	lines = content.splitlines()
	for i, line in enumerate(lines):
		if not line.startswith('<!-- wp:syntaxhighlighter/code'):
			continue
		# Drop the 30-character comment prefix and the 4-character ' -->' suffix.
		possible_params = line[30:-4].strip()
		# Nothing to copy, or no following line to copy it onto.
		if possible_params == '' or i + 1 >= len(lines):
			continue
		params = json.loads(possible_params)
		nextline = lines[i + 1]
		# The attributes are inserted after the first four characters of the
		# next line (presumably the '<pre' of the opening tag - verify against
		# the export).
		if 'language' in params:
			nextline = nextline[:4] + f" data-language={params['language']} " + nextline[4:]
		if 'highlightLines' in params:
			nextline = nextline[:4] + f" data-highlight={params['highlightLines']} " + nextline[4:]
		lines[i + 1] = nextline

	content = '\n'.join(lines)
	content = md(content, code_language_callback=code_parser).strip() + '\n'

	# Turn [ref]...[/ref] shortcodes into numbered footnotes. Each distinct
	# reference text gets one number, in order of first appearance.
	refs = list(dict.fromkeys(find_all_matches(ref_regex, content, 1)))
	if refs:
		current = 0
		while m := ref_regex.search(content, current):
			thing = m.group(1)
			current = m.start() + 1
			try:
				idx = refs.index(thing) + 1
			except ValueError:
				# Earlier replacements can expose a span that was not in the
				# original list; report it and leave it in place.
				print(f"{thing} not found")
				continue
			content = content.replace(m.group(0), f"[^{idx}]")

		ref_output = [f"[^{n}]: {ref}" for n, ref in enumerate(refs, start=1)]
		content = content + '\n' + '\n\n'.join(ref_output) + '\n'

	content = fix_short_code(content)
	content = fix_math(content)
	content = fix_nesting_code_blocks(content)
	content = trim_code_blocks(content)
	if not content.endswith('\n'):
		content = content + '\n'

	# json.dumps yields a double-quoted string with quotes and backslashes
	# escaped, so titles or author names containing '"' no longer break the
	# front matter.
	frontmatter = (
		"---\n"
		f"layout: {post_type}\n"
		f"id: {post_id}\n"
		f"title: {json.dumps(str(title), ensure_ascii=False)}\n"
		f"creator: {json.dumps(str(creator), ensure_ascii=False)}\n"
		f"date: {date}\n"
		f"categories:\n{categories}\n"
		f"tags:\n{tags}\n"
		f"draft: {'true' if status == 'draft' else 'false'}\n"
		f"published: {'true' if status == 'publish' else 'false'}\n"
		"---\n\n"
	)

	title_slug = slugify(title)
	if len(title_slug) < 1:
		title_slug = 'untitled'

	# Folder is output/<first 4 chars of date>/<chars 5-6 of date>/.
	output_folder = 'output/' + str(date)[:4] + '/' + str(date)[5:7] + '/'
	mkdirp(output_folder)
	filename_candidate = output_folder + title_slug + ".md"
	filename_disambiguation = 1

	# On a name clash append _(n), staying inside the same dated folder.
	while os.path.exists(filename_candidate):
		filename_candidate = f"{output_folder}{title_slug}_({filename_disambiguation}).md"
		filename_disambiguation += 1

	with open(filename_candidate, 'w', encoding='utf-8') as f:
		f.write(frontmatter)
		f.write(content)