#!/usr/bin/env python3 # encoding: utf-8 from bs4 import BeautifulSoup from slugify import slugify from markdownify import MarkdownConverter import os import re import json in_file = 'wpexport20241219.xml' class IframeConverter(MarkdownConverter): def convert_iframe(self, el, text, convert_as_inline): src = el['src'] if 'youtube' in src: code = re.search(r'embed/([^?]+)', src).group(1) return f"::: youtube {code}\n:::\n" if 'steam' in src: code = re.search(r'\/([0-9]+)', src).group(1) return f"::: steam {code}\n:::\n" return f"::: iframe {src}\n:::\n" def mkdirp(path): if not os.path.exists(path): os.makedirs(path) def md(html, **options): return IframeConverter(**options).convert(html) def code_parser(el): if not el.has_attr('data-language'): return 'text' lang = el['data-language'] if lang == 'jscript': return 'js' return lang def find_all_matches(pattern, string, group=0): pat = re.compile(pattern) pos = 0 out = [] while m := pat.search(string, pos): pos = m.start() + 1 out.append(m[group]) return out def fix_math_content(content): return content.replace(r'\_', '_').replace(r'\*', '*') def fix_math(content): math_regex = re.compile(r'(\\[\[\(]|\${1,2}[^0-9])(.+?)(\${1,2}|\\[$\)\]])') pos = 0 out = '' while m := math_regex.search(content, pos): out = out + content[pos:m.start()] out = out + m.group(1) + fix_math_content(m.group(2)) + m.group(3) pos = m.end() return out + content[pos:] def fix_short_code_content(tag, params, content): if tag == 'c' or tag == 'cpp' or tag == 'python' or tag == 'csharp' or tag == 'java' or tag == 'javascript' or tag == 'typescript' or tag == 'bash' or tag == 'html' or tag == 'css' or tag == 'php' or tag == 'sql' or tag == 'json' or tag == 'xml' or tag == 'yaml' or tag == 'markdown' or tag == 'text' or tag == 'plaintext': return f"```{tag}\n{content}\n```" elif tag == 'latex': return f"\({content}\)" elif tag == 'graphviz': return f"::: graphviz\n{content}\n:::\n" elif tag == 'abcjs' or tag == 'abc': return f"::: 
abc\n{content}\n:::\n" else: return None def fix_short_code(content): shortcode_regex = re.compile(r'\[([a-zA-Z0-9]+)([^\]]*)\](.+?)\[\/\1\]', re.DOTALL) pos = 0 out = '' while m := shortcode_regex.search(content, pos): out = out + content[pos:m.start()] fixed_content = fix_short_code_content(m.group(1), m.group(2), m.group(3)) if fixed_content is None: out = out + m.group(0) else: out = out + fixed_content pos = m.end() return out + content[pos:] def fix_nesting_code_blocks(content): lines = content.splitlines() out = [] for i in range(0, len(lines)): line = lines[i] if line.startswith('```'): if i + 1 < len(lines) and lines[i + 1].startswith('```'): continue out.append(line) return '\n'.join(out) def fix_code_block_content(content): return content.replace(r'\_', '_').replace(r'\*', '*') def trim_code_blocks(content): codeblock_regex = re.compile(r'```(.+?)```', re.DOTALL) pos = 0 out = '' while m := codeblock_regex.search(content, pos): out = out + content[pos:m.start()].strip() parts = re.split(r'[\n\r ]', m.group(1), 1) out = out + '\n\n```' + parts[0].strip() + '\n' + fix_code_block_content(parts[1]).strip() + '\n```\n\n' pos = m.end() return out + content[pos:].strip() mkdirp('output') with open(in_file, 'r', encoding='utf-8') as f: raw_content = f.read() soup = BeautifulSoup(raw_content, 'xml') items = soup.find_all('item') print(f"{len(items)} items found") for item in items: post_type_raw = item.find('wp:post_type', recursive=False) if post_type_raw is None: continue post_type = post_type_raw.contents[0] if post_type != 'page' and post_type != 'post': continue post_id = item.find('wp:post_id', recursive=False).contents[0] title = item.find('title', recursive=False).contents[0] creator = item.find('dc:creator', recursive=False).contents[0] date = item.find('wp:post_date', recursive=False).contents[0] status = item.find('wp:status', recursive=False).contents[0] tags = item.findAll('category', {'domain': 'post_tag'}) tags = '\n'.join(list(map(lambda x: ' - ' 
# NOTE(review): continuation of the per-item export loop, truncated
# mid-statement at the end of this chunk. The empty r'' regex patterns
# and the unterminated startswith(' below strongly suggest that
# HTML-comment-like text (presumably '<!--more-->' and similar markers)
# was stripped from this file in transit — recover the patterns from the
# original script before running; kept byte-identical here.
+ x.contents[0], tags))) categories = item.findAll('category', {'domain': 'category'}) categories = '\n'.join( list(map(lambda x: ' - ' + x.contents[0], categories))) content = item.find('content:encoded', recursive=False).contents[0] content = re.sub(r'', '', content) content = re.sub(r'', '
:::more:::
', content) lines = [s for s in content.splitlines()] for i in range(len(lines)): line = lines[i] if line.startswith('