#!/usr/bin/env python3
# encoding: utf-8
"""Convert the posts and pages of a WordPress XML export into Markdown files.

Every ``<item>`` whose ``wp:post_type`` is ``post`` or ``page`` is written to
``output/<title-slug>.md``: a front-matter header (layout, title, creator,
date, categories, tags, draft flag) followed by the item's
``content:encoded`` body converted to Markdown with markdownify.
"""

import os
import re

from bs4 import BeautifulSoup
from markdownify import markdownify as md
from slugify import slugify

in_file = 'wpexport20241219.xml'
out_dir = 'output'

# NOTE(review): both regex patterns were empty (r'') in the source as
# received, which looks like HTML-comment markup lost in transit. An empty
# pattern matches between every character, so the second substitution would
# have scattered the ":::more:::" marker through the whole post. The two
# patterns below are reconstructed from WordPress' comment syntax (block
# delimiters such as "<!-- wp:paragraph -->" and the "<!--more-->" tag) --
# please confirm against the original script.
_BLOCK_COMMENT_RE = re.compile(r'<!--\s*/?wp:.*?-->', re.DOTALL)
_MORE_TAG_RE = re.compile(r'<!--more.*?-->')
_MORE_MARKER = '\n:::more:::\n'


def _text(parent, name, default=''):
    """Return the text of *parent*'s direct child element *name*.

    *default* is returned when the element is absent; an element that is
    present but empty yields ''. This replaces ``.contents[0]``, which raises
    IndexError on empty elements (e.g. an untitled post or an empty body).
    """
    node = parent.find(name, recursive=False)
    if node is None:
        return default
    return node.get_text()


def _category_lines(item, domain):
    """Return one `` - name`` line per <category> of *item* in *domain*."""
    nodes = item.find_all('category', {'domain': domain})
    return '\n'.join(' - ' + node.get_text() for node in nodes)


def _yaml_escape(value):
    """Escape backslashes and double quotes for a double-quoted YAML scalar."""
    return value.replace('\\', '\\\\').replace('"', '\\"')


def _to_markdown(html):
    """Clean up WordPress markup in *html* and convert it to Markdown."""
    html = _BLOCK_COMMENT_RE.sub('', html)
    html = _MORE_TAG_RE.sub(_MORE_MARKER, html)
    # Drop blank lines and surrounding whitespace before conversion.
    stripped = (line.strip() for line in html.splitlines())
    html = '\n'.join(line for line in stripped if line)
    return md(html).strip()


def _unique_path(directory, stem):
    """Return ``directory/stem.md``, adding ``_(n)`` while the name is taken."""
    candidate = os.path.join(directory, stem + '.md')
    counter = 1
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f'{stem}_({counter}).md')
        counter += 1
    return candidate


os.makedirs(out_dir, exist_ok=True)

with open(in_file, 'r', encoding='utf-8') as f:
    raw_content = f.read()

soup = BeautifulSoup(raw_content, 'xml')
items = soup.find_all('item')
print(f"{len(items)} items found")

for item in items:
    # Only real posts and pages are exported; any other item type (or an
    # item without a post type) is skipped.
    post_type = _text(item, 'wp:post_type', default=None)
    if post_type not in ('page', 'post'):
        continue

    title = _text(item, 'title')
    creator = _text(item, 'dc:creator')
    date = _text(item, 'wp:post_date')
    status = _text(item, 'wp:status')
    tags = _category_lines(item, 'post_tag')
    categories = _category_lines(item, 'category')
    content = _to_markdown(_text(item, 'content:encoded'))

    frontmatter = (
        "---\n"
        f"layout: {post_type}\n"
        f"title: \"{_yaml_escape(title)}\"\n"
        f"creator: \"{_yaml_escape(creator)}\"\n"
        f"date: {date}\n"
        f"categories:\n{categories}\n"
        f"tags:\n{tags}\n"
        f"draft: {'yes' if status == 'draft' else 'no'}\n"
        "---\n\n"
    )

    # Titles that slugify to nothing still need a usable file name.
    title_slug = slugify(title) or 'untitled'
    out_path = _unique_path(out_dir, title_slug)

    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(frontmatter)
        f.write(content)