preserve syntax highlighter language settings

This commit is contained in:
dousha 2025-01-06 00:19:45 +08:00
parent 1f2dde537c
commit b7a5a8dce2
3 changed files with 81 additions and 51 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

3
.idea/misc.xml generated
View File

@@ -1,4 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project version="4"> <project version="4">
<component name="Black">
<option name="sdkName" value="wp-migrator" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="wp-migrator" project-jdk-type="Python SDK" /> <component name="ProjectRootManager" version="2" project-jdk-name="wp-migrator" project-jdk-type="Python SDK" />
</project> </project>

129
main.py
View File

@@ -6,36 +6,42 @@ from slugify import slugify
from markdownify import MarkdownConverter from markdownify import MarkdownConverter
import os import os
import re import re
import json
in_file = 'wpexport20241219.xml' in_file = 'wpexport20241219.xml'
class IframeConverter(MarkdownConverter): class IframeConverter(MarkdownConverter):
def convert_iframe(self, el, text, convert_as_inline): def convert_iframe(self, el, text, convert_as_inline):
src = el['src'] src = el['src']
if 'youtube' in src: if 'youtube' in src:
code = re.search(r'embed/([^?]+)', src).group(1) code = re.search(r'embed/([^?]+)', src).group(1)
return f"::: youtube {code}\n:::\n" return f"::: youtube {code}\n:::\n"
if 'steam' in src: if 'steam' in src:
code = re.search(r'\/([0-9]+)', src).group(1) code = re.search(r'\/([0-9]+)', src).group(1)
return f"::: steam {code}\n:::\n" return f"::: steam {code}\n:::\n"
return f"::: iframe {src}\n:::\n" return f"::: iframe {src}\n:::\n"
def mkdirp(path): def mkdirp(path):
if not os.path.exists(path): if not os.path.exists(path):
os.makedirs(path) os.makedirs(path)
def md(html, **options): def md(html, **options):
return IframeConverter(**options).convert(html) return IframeConverter(**options).convert(html)
def code_parser(el):
return el['data-language'][0] if el.has_attr('data-language') else 'text'
mkdirp('output') mkdirp('output')
with open(in_file, 'r', encoding='utf-8') as f: with open(in_file, 'r', encoding='utf-8') as f:
raw_content = f.read() raw_content = f.read()
soup = BeautifulSoup(raw_content, 'xml') soup = BeautifulSoup(raw_content, 'xml')
@@ -43,43 +49,64 @@ items = soup.find_all('item')
print(f"{len(items)} items found") print(f"{len(items)} items found")
for item in items: for item in items:
post_type_raw = item.find('wp:post_type', recursive=False) post_type_raw = item.find('wp:post_type', recursive=False)
if post_type_raw is None: if post_type_raw is None:
continue continue
post_type = post_type_raw.contents[0] post_type = post_type_raw.contents[0]
if post_type != 'page' and post_type != 'post': if post_type != 'page' and post_type != 'post':
continue continue
post_id = item.find('wp:post_id', recursive=False).contents[0] post_id = item.find('wp:post_id', recursive=False).contents[0]
title = item.find('title', recursive=False).contents[0] title = item.find('title', recursive=False).contents[0]
creator = item.find('dc:creator', recursive=False).contents[0] creator = item.find('dc:creator', recursive=False).contents[0]
date = item.find('wp:post_date', recursive=False).contents[0] date = item.find('wp:post_date', recursive=False).contents[0]
status = item.find('wp:status', recursive=False).contents[0] status = item.find('wp:status', recursive=False).contents[0]
tags = item.findAll('category', { 'domain': 'post_tag' }) tags = item.findAll('category', {'domain': 'post_tag'})
tags = '\n'.join(list(map(lambda x: ' - ' + x.contents[0], tags))) tags = '\n'.join(list(map(lambda x: ' - ' + x.contents[0], tags)))
categories = item.findAll('category', { 'domain': 'category' }) categories = item.findAll('category', {'domain': 'category'})
categories = '\n'.join(list(map(lambda x: ' - ' + x.contents[0], categories))) categories = '\n'.join(
content = item.find('content:encoded', recursive=False).contents[0] list(map(lambda x: ' - ' + x.contents[0], categories)))
content = re.sub(r'<!-- \/?wp:.+?\s*-->', '', content) content = item.find('content:encoded', recursive=False).contents[0]
content = re.sub(r'<!--more-->', '<p class="more">:::more:::</p>', content) content = re.sub(r'<!-- /?wp:((?!syntaxhighlighter).)*\s*-->', '', content)
content = '\n'.join([s.strip() for s in content.splitlines() if s.strip()]) content = re.sub(r'<!--more-->', '<p class="more">:::more:::</p>', content)
content = md(content).strip() + '\n' lines = [s for s in content.splitlines()]
frontmatter = f"---\nlayout: {post_type}\nid: {post_id}\ntitle: \"{title}\"\ncreator: \"{creator}\"\ndate: {date}\ncategories:\n{categories}\ntags:\n{tags}\ndraft: {'yes' if status == 'draft' else 'no'}\npublished: {'yes' if status == 'publish' else 'no'}\n---\n\n" for i in range(len(lines)):
line = lines[i]
if line.startswith('<!-- wp:syntaxhighlighter/code'):
possibleParams = line[30:-4].strip()
if possibleParams == '':
continue
params = json.loads(possibleParams)
nextline = lines[i + 1]
if 'language' in params:
nextline = nextline[
:4] + f" data-language={params['language']} " + nextline[
4:]
if 'highlightLines' in params:
nextline = nextline[
:4] + f" data-highlight={params['highlightLines']} " + nextline[
4:]
lines[i + 1] = nextline
title_slug = slugify(title) content = '\n'.join(lines)
if len(title_slug) < 1: content = md(content, code_language_callback=code_parser).strip() + '\n'
title_slug = 'untitled' frontmatter = f"---\nlayout: {post_type}\nid: {post_id}\ntitle: \"{title}\"\ncreator: \"{creator}\"\ndate: {date}\ncategories:\n{categories}\ntags:\n{tags}\ndraft: {'true' if status == 'draft' else 'false'}\npublished: {'true' if status == 'publish' else 'false'}\n---\n\n"
output_folder = 'output/' + str(date)[:4] + '/' + str(date)[5:7] + '/'
mkdirp(output_folder)
filename_candidate = output_folder + title_slug + ".md"
filename_disambiguation = 1
while os.path.exists(filename_candidate): title_slug = slugify(title)
filename_candidate = "output/" + title_slug + "_(" + str(filename_disambiguation) + ").md" if len(title_slug) < 1:
filename_disambiguation = filename_disambiguation + 1 title_slug = 'untitled'
with open(filename_candidate, 'w', encoding='utf-8') as f: output_folder = 'output/' + str(date)[:4] + '/' + str(date)[5:7] + '/'
f.write(frontmatter) mkdirp(output_folder)
f.write(content) filename_candidate = output_folder + title_slug + ".md"
filename_disambiguation = 1
while os.path.exists(filename_candidate):
filename_candidate = "output/" + title_slug + "_(" + str(
filename_disambiguation) + ").md"
filename_disambiguation = filename_disambiguation + 1
with open(filename_candidate, 'w', encoding='utf-8') as f:
f.write(frontmatter)
f.write(content)