From b7a5a8dce2bb32bbe3ab61f28897d872b08db5ee Mon Sep 17 00:00:00 2001
From: dousha <lijiahao34@live.com>
Date: Mon, 6 Jan 2025 00:19:45 +0800
Subject: [PATCH] preserve syntax highlighter language settings

---
 .DS_Store      | Bin 0 -> 6148 bytes
 .idea/misc.xml |   3 ++
 main.py        | 129 ++++++++++++++++++++++++++++++-------------------
 3 files changed, 81 insertions(+), 51 deletions(-)
 create mode 100644 .DS_Store
diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..bb08a0ccdaea6b2cc8dde2a91d07c0fe3f9052e9
GIT binary patch
literal 6148
zcmeHKu};G<5Pb&~RAR}%z=D)7s0d<V3so7I`2i^HP$_LnX+_!cKMec`{}kT&Tp%Gc
zLg-GqduRLZY(FWn4M2vQNd_bU`cy?xr$e;6b7(IH_lOc|46ww@O;J9Ki;hBnu}jx}
zh&k>t!vf#dUyeDVv|LdKbr5>M0ZMGJ=1ysCue(?|ZN(;7VTwma&Vn~*Yr)(n&a}q1
zosYLx{xP08&y<l%&bngGInJmT$Y?WMvd)&PFPiJXJL|mKn(Lari!<O1I0MdrGw{U#
z&uo?ck)c~>z!`7`J{i#eLrPW5BUXmt*Fh7W07Sh;2cfT*X3a?><`FAHj!?u>iIyty
zi6NHG{xrtr5i3JWhs4K+#F=-G7YVboej36dxuIKUz!_*Vuy4&o&;M(FGM$h7W{S6*
z0cYT!F(AFs?P$nN#k2Lt_VlbxsdrQrjjPn5&~ID<*wFvTNf@;8RDH(f5i3KjqWvMA
P=of)Nh#P0%7Z~^eZ9htW

literal 0
HcmV?d00001

diff --git a/.idea/misc.xml b/.idea/misc.xml
index cbe9f1e..d6d2e87 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,4 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
+  <component name="Black">
+    <option name="sdkName" value="wp-migrator" />
+  </component>
   <component name="ProjectRootManager" version="2" project-jdk-name="wp-migrator" project-jdk-type="Python SDK" />
 </project>
\ No newline at end of file
diff --git a/main.py b/main.py
index dc8296f..79caa85 100644
--- a/main.py
+++ b/main.py
@@ -6,36 +6,42 @@ from slugify import slugify
 from markdownify import MarkdownConverter
 import os
 import re
+import json
 
 in_file = 'wpexport20241219.xml'
 
+
 class IframeConverter(MarkdownConverter):
-    def convert_iframe(self, el, text, convert_as_inline):
-        src = el['src']
-        if 'youtube' in src:
-            code = re.search(r'embed/([^?]+)', src).group(1)
-            return f"::: youtube {code}\n:::\n"
-        
-        if 'steam' in src:
-            code = re.search(r'\/([0-9]+)', src).group(1)
-            return f"::: steam {code}\n:::\n"
-            
-        return f"::: iframe {src}\n:::\n"
+	def convert_iframe(self, el, text, convert_as_inline):
+		src = el['src']
+		if 'youtube' in src:
+			code = re.search(r'embed/([^?]+)', src).group(1)
+			return f"::: youtube {code}\n:::\n"
+
+		if 'steam' in src:
+			code = re.search(r'\/([0-9]+)', src).group(1)
+			return f"::: steam {code}\n:::\n"
+
+		return f"::: iframe {src}\n:::\n"
 
 
 def mkdirp(path):
-    if not os.path.exists(path):
-        os.makedirs(path)
+	if not os.path.exists(path):
+		os.makedirs(path)
 
 
 def md(html, **options):
-    return IframeConverter(**options).convert(html)
+	return IframeConverter(**options).convert(html)
+
+
+def code_parser(el):  # passed to md() as markdownify's code_language_callback; returns the fence language
+	return el['data-language'] if el.has_attr('data-language') else 'text'  # attribute is a plain str, so indexing [0] would keep only its first character
 
 
 mkdirp('output')
 
 with open(in_file, 'r', encoding='utf-8') as f:
-    raw_content = f.read()
+	raw_content = f.read()
 
 soup = BeautifulSoup(raw_content, 'xml')
 
@@ -43,43 +49,64 @@ items = soup.find_all('item')
 print(f"{len(items)} items found")
 
 for item in items:
-    post_type_raw = item.find('wp:post_type', recursive=False)
-    if post_type_raw is None:
-        continue
+	post_type_raw = item.find('wp:post_type', recursive=False)
+	if post_type_raw is None:
+		continue
 
-    post_type = post_type_raw.contents[0]
-    if post_type != 'page' and post_type != 'post':
-        continue
+	post_type = post_type_raw.contents[0]
+	if post_type != 'page' and post_type != 'post':
+		continue
 
-    post_id = item.find('wp:post_id', recursive=False).contents[0]
-    title = item.find('title', recursive=False).contents[0]
-    creator = item.find('dc:creator', recursive=False).contents[0]
-    date = item.find('wp:post_date', recursive=False).contents[0]
-    status = item.find('wp:status', recursive=False).contents[0]
-    tags = item.findAll('category', { 'domain': 'post_tag' })
-    tags = '\n'.join(list(map(lambda x: ' - ' + x.contents[0], tags)))
-    categories = item.findAll('category', { 'domain': 'category' })
-    categories = '\n'.join(list(map(lambda x: ' - ' + x.contents[0], categories)))
-    content = item.find('content:encoded', recursive=False).contents[0]
-    content = re.sub(r'<!-- \/?wp:.+?\s*-->', '', content)
-    content = re.sub(r'<!--more-->', '<p class="more">:::more:::</p>', content)
-    content = '\n'.join([s.strip() for s in content.splitlines() if s.strip()])
-    content = md(content).strip() + '\n'
-    frontmatter = f"---\nlayout: {post_type}\nid: {post_id}\ntitle: \"{title}\"\ncreator: \"{creator}\"\ndate: {date}\ncategories:\n{categories}\ntags:\n{tags}\ndraft: {'yes' if status == 'draft' else 'no'}\npublished: {'yes' if status == 'publish' else 'no'}\n---\n\n"
+	post_id = item.find('wp:post_id', recursive=False).contents[0]
+	title = item.find('title', recursive=False).contents[0]
+	creator = item.find('dc:creator', recursive=False).contents[0]
+	date = item.find('wp:post_date', recursive=False).contents[0]
+	status = item.find('wp:status', recursive=False).contents[0]
+	tags = item.findAll('category', {'domain': 'post_tag'})
+	tags = '\n'.join(list(map(lambda x: ' - ' + x.contents[0], tags)))
+	categories = item.findAll('category', {'domain': 'category'})
+	categories = '\n'.join(
+		list(map(lambda x: ' - ' + x.contents[0], categories)))
+	content = item.find('content:encoded', recursive=False).contents[0]
+	content = re.sub(r'<!-- /?wp:((?!syntaxhighlighter).)*?\s*-->', '', content)
+	content = re.sub(r'<!--more-->', '<p class="more">:::more:::</p>', content)
+	lines = content.splitlines()
+	# Copy each syntaxhighlighter block's JSON options onto the <pre> tag that opens the following line
+	for i, line in enumerate(lines[:-1]):  # [:-1]: a marker on the very last line has no following line
+		line = line.strip()
+		if not line.startswith('<!-- wp:syntaxhighlighter/code'):
+			continue
+		# Whatever sits between the 30-character marker and the closing '-->' is the optional JSON
+		possibleParams = line[30:-3].strip()
+		nextline = lines[i + 1].lstrip()
+		if possibleParams == '' or not nextline.startswith('<pre'):
+			continue
+		params = json.loads(possibleParams)
+		attrs = ''
+		if 'language' in params:
+			attrs += f" data-language=\"{params['language']}\""
+		if 'highlightLines' in params:
+			attrs += f" data-highlight=\"{params['highlightLines']}\""
+		lines[i + 1] = nextline[:4] + attrs + nextline[4:]
 
-    title_slug = slugify(title)
-    if len(title_slug) < 1:
-        title_slug = 'untitled'
-    
-    output_folder = 'output/' + str(date)[:4] + '/' + str(date)[5:7] + '/'
-    mkdirp(output_folder)
-    filename_candidate = output_folder + title_slug + ".md"
-    filename_disambiguation = 1
+	content = '\n'.join(lines)
+	content = md(content, code_language_callback=code_parser).strip() + '\n'
+	frontmatter = f"---\nlayout: {post_type}\nid: {post_id}\ntitle: {json.dumps(str(title), ensure_ascii=False)}\ncreator: {json.dumps(str(creator), ensure_ascii=False)}\ndate: {date}\ncategories:\n{categories}\ntags:\n{tags}\ndraft: {'true' if status == 'draft' else 'false'}\npublished: {'true' if status == 'publish' else 'false'}\n---\n\n"  # json.dumps emits the double-quoted scalar with embedded quotes and backslashes escaped
 
-    while os.path.exists(filename_candidate):
-        filename_candidate = "output/" + title_slug + "_(" + str(filename_disambiguation) + ").md"
-        filename_disambiguation = filename_disambiguation + 1
-    
-    with open(filename_candidate, 'w', encoding='utf-8') as f:
-        f.write(frontmatter)
-        f.write(content)
+	title_slug = slugify(title)
+	if len(title_slug) < 1:
+		title_slug = 'untitled'
+
+	output_folder = 'output/' + str(date)[:4] + '/' + str(date)[5:7] + '/'
+	mkdirp(output_folder)
+	filename_candidate = output_folder + title_slug + ".md"
+	filename_disambiguation = 1
+
+	while os.path.exists(filename_candidate):
+		# Build the alternative inside output_folder so a duplicate title stays in its year/month directory
+		filename_candidate = f"{output_folder}{title_slug}_({filename_disambiguation}).md"
+		filename_disambiguation = filename_disambiguation + 1
+
+	with open(filename_candidate, 'w', encoding='utf-8') as f:
+		f.write(frontmatter)
+		f.write(content)