#!/usr/bin/env python3 # encoding: utf-8 from bs4 import BeautifulSoup from slugify import slugify from markdownify import MarkdownConverter import os import re import json in_file = 'wpexport20241219.xml' class IframeConverter(MarkdownConverter): def convert_iframe(self, el, text, convert_as_inline): src = el['src'] if 'youtube' in src: code = re.search(r'embed/([^?]+)', src).group(1) return f"::: youtube {code}\n:::\n" if 'steam' in src: code = re.search(r'\/([0-9]+)', src).group(1) return f"::: steam {code}\n:::\n" return f"::: iframe {src}\n:::\n" def mkdirp(path): if not os.path.exists(path): os.makedirs(path) def md(html, **options): return IframeConverter(**options).convert(html) def code_parser(el): return el['data-language'][0] if el.has_attr('data-language') else 'text' mkdirp('output') with open(in_file, 'r', encoding='utf-8') as f: raw_content = f.read() soup = BeautifulSoup(raw_content, 'xml') items = soup.find_all('item') print(f"{len(items)} items found") for item in items: post_type_raw = item.find('wp:post_type', recursive=False) if post_type_raw is None: continue post_type = post_type_raw.contents[0] if post_type != 'page' and post_type != 'post': continue post_id = item.find('wp:post_id', recursive=False).contents[0] title = item.find('title', recursive=False).contents[0] creator = item.find('dc:creator', recursive=False).contents[0] date = item.find('wp:post_date', recursive=False).contents[0] status = item.find('wp:status', recursive=False).contents[0] tags = item.findAll('category', {'domain': 'post_tag'}) tags = '\n'.join(list(map(lambda x: ' - ' + x.contents[0], tags))) categories = item.findAll('category', {'domain': 'category'}) categories = '\n'.join( list(map(lambda x: ' - ' + x.contents[0], categories))) content = item.find('content:encoded', recursive=False).contents[0] content = re.sub(r'', '', content) content = re.sub(r'', '

:::more:::

', content) lines = [s for s in content.splitlines()] for i in range(len(lines)): line = lines[i] if line.startswith('