#!/usr/bin/env python3
# encoding: utf-8

import json
import os
import re

from bs4 import BeautifulSoup
from markdownify import markdownify as md
from slugify import slugify

# Path of the WordPress XML export to convert.
in_file = 'wpexport20241219.xml'
# Directory that receives one Markdown file per converted item.
out_dir = 'output'

# Matches delimiter comments of the form "<!-- wp:name ... -->" and
# "<!-- /wp:name -->" so they can be dropped before Markdown conversion.
WP_BLOCK_COMMENT = re.compile(r'<!-- \/?wp:.+?\s*-->')


def child_text(item, tag, default=''):
    """Return the first content node of the direct child *tag* of *item*.

    *default* is returned when the element is missing or has no content
    (for example an untitled post or an empty body); indexing
    ``contents[0]`` unconditionally would raise in those cases.
    """
    node = item.find(tag, recursive=False)
    if node is None or not node.contents:
        return default
    return node.contents[0]


def category_names(item, domain):
    """Return the names of the ``<category>`` elements of *item* in *domain*.

    Elements without any content are skipped instead of raising.
    """
    nodes = item.find_all('category', {'domain': domain})
    return [node.contents[0] for node in nodes if node.contents]


def yaml_quote(text):
    """Return *text* as a double-quoted YAML scalar.

    A JSON string literal is also a valid YAML double-quoted scalar, so
    ``json.dumps`` handles the escaping of quotes, backslashes and control
    characters. ``ensure_ascii=False`` keeps non-ASCII characters readable.
    """
    return json.dumps(str(text), ensure_ascii=False)


def yaml_list(values):
    """Render *values* as YAML sequence entries, one `` - value`` per line."""
    return '\n'.join(' - ' + value for value in values)


def build_frontmatter(post_type, title, creator, date, categories, tags, status):
    """Build the front matter block written at the top of each file.

    *categories* and *tags* are iterables of strings. ``draft`` is ``yes``
    only when *status* equals ``'draft'``. The returned text ends with a
    blank line so the body can be appended directly.
    """
    draft = 'yes' if status == 'draft' else 'no'
    return (
        "---\n"
        f"layout: {post_type}\n"
        f"title: {yaml_quote(title)}\n"
        f"creator: {yaml_quote(creator)}\n"
        f"date: {date}\n"
        f"categories:\n{yaml_list(categories)}\n"
        f"tags:\n{yaml_list(tags)}\n"
        f"draft: {draft}\n"
        "---\n\n"
    )


def clean_wp_html(html):
    """Prepare exported post HTML for Markdown conversion.

    Removes the ``<!-- wp:... -->`` delimiter comments, turns the
    ``<!--more-->`` marker into a paragraph so it is not lost with the
    other comments, and drops blank lines and per-line surrounding
    whitespace.
    """
    html = WP_BLOCK_COMMENT.sub('', html)
    html = html.replace('<!--more-->', '<p class="more">:::more:::</p>')
    stripped_lines = (line.strip() for line in html.splitlines())
    return '\n'.join(line for line in stripped_lines if line)


def unique_output_path(slug, directory=out_dir):
    """Return a ``.md`` path in *directory* for *slug* that does not exist yet.

    The first choice is ``<slug>.md``; on collision ``<slug>_(1).md``,
    ``<slug>_(2).md``, ... are tried in order.
    """
    candidate = f"{directory}/{slug}.md"
    attempt = 1
    while os.path.exists(candidate):
        candidate = f"{directory}/{slug}_({attempt}).md"
        attempt += 1
    return candidate


def convert_item(item):
    """Write one export ``<item>`` as a Markdown file.

    Only items whose ``wp:post_type`` is ``page`` or ``post`` are
    converted. Returns the path written, or ``None`` when the item is
    skipped.
    """
    post_type = child_text(item, 'wp:post_type')
    if post_type not in ('page', 'post'):
        return None

    title = child_text(item, 'title')
    frontmatter = build_frontmatter(
        post_type,
        title,
        child_text(item, 'dc:creator'),
        child_text(item, 'wp:post_date'),
        category_names(item, 'category'),
        category_names(item, 'post_tag'),
        child_text(item, 'wp:status'),
    )
    content = md(clean_wp_html(child_text(item, 'content:encoded'))).strip()

    # A title that slugifies to nothing still needs a usable file name.
    path = unique_output_path(slugify(title) or 'untitled')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(frontmatter)
        f.write(content)
    return path


def main():
    """Convert every page and post found in ``in_file`` into ``out_dir``."""
    os.makedirs(out_dir, exist_ok=True)

    with open(in_file, 'r', encoding='utf-8') as f:
        raw_content = f.read()

    soup = BeautifulSoup(raw_content, 'xml')
    items = soup.find_all('item')
    print(f"{len(items)} items found")

    for item in items:
        convert_item(item)


if __name__ == "__main__":
    main()