From b7a5a8dce2bb32bbe3ab61f28897d872b08db5ee Mon Sep 17 00:00:00 2001 From: dousha Date: Mon, 6 Jan 2025 00:19:45 +0800 Subject: [PATCH] preserve syntax highlighter language settings --- .DS_Store | Bin 0 -> 6148 bytes .idea/misc.xml | 3 ++ main.py | 129 ++++++++++++++++++++++++++++++------------------- 3 files changed, 81 insertions(+), 51 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..bb08a0ccdaea6b2cc8dde2a91d07c0fe3f9052e9 GIT binary patch literal 6148 zcmeHKu};G<5Pb&~RAR}%z=D)7s0dt!vf#dUyeDVv|LdKbr5>M0ZMGJ=1ysCue(?|ZN(;7VTwma&Vn~*Yr)(n&a}q1 zosYLx{xP08&yz!`7`J{i#eLrPW5BUXmt*Fh7W07Sh;2cfT*X3a?><`FAHj!?u>iIyty zi6NHG{xrtr5i3JWhs4K+#F=-G7YVboej36dxuIKUz!_*Vuy4&o&;M(FGM$h7W{S6* z0cYT!F(AFs?P$nN#k2Lt_VlbxsdrQrjjPn5&~ID<*wFvTNf@;8RDH(f5i3KjqWvMA P=of)Nh#P0%7Z~^eZ9htW literal 0 HcmV?d00001 diff --git a/.idea/misc.xml b/.idea/misc.xml index cbe9f1e..d6d2e87 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,7 @@ + + \ No newline at end of file diff --git a/main.py b/main.py index dc8296f..79caa85 100644 --- a/main.py +++ b/main.py @@ -6,36 +6,42 @@ from slugify import slugify from markdownify import MarkdownConverter import os import re +import json in_file = 'wpexport20241219.xml' + class IframeConverter(MarkdownConverter): - def convert_iframe(self, el, text, convert_as_inline): - src = el['src'] - if 'youtube' in src: - code = re.search(r'embed/([^?]+)', src).group(1) - return f"::: youtube {code}\n:::\n" - - if 'steam' in src: - code = re.search(r'\/([0-9]+)', src).group(1) - return f"::: steam {code}\n:::\n" - - return f"::: iframe {src}\n:::\n" + def convert_iframe(self, el, text, convert_as_inline): + src = el['src'] + if 'youtube' in src: + code = re.search(r'embed/([^?]+)', src).group(1) + return f"::: youtube {code}\n:::\n" + + if 'steam' in src: + code = re.search(r'\/([0-9]+)', src).group(1) + return f"::: steam {code}\n:::\n" + + return f"::: iframe {src}\n:::\n" def mkdirp(path): - if not os.path.exists(path): - os.makedirs(path) + if not os.path.exists(path): + os.makedirs(path) def md(html, **options): - return IframeConverter(**options).convert(html) + return IframeConverter(**options).convert(html) + + +def code_parser(el): + return el['data-language'][0] if el.has_attr('data-language') else 'text' mkdirp('output') with open(in_file, 'r', encoding='utf-8') as f: - raw_content = f.read() + raw_content = f.read() soup = BeautifulSoup(raw_content, 'xml') @@ -43,43 +49,64 @@ items = soup.find_all('item') print(f"{len(items)} items found") for item in items: - post_type_raw = item.find('wp:post_type', recursive=False) - if post_type_raw is None: - continue + post_type_raw = item.find('wp:post_type', recursive=False) + if post_type_raw is None: + continue - post_type = post_type_raw.contents[0] - if post_type != 'page' and post_type != 'post': - continue + post_type = post_type_raw.contents[0] + if post_type != 'page' and post_type != 'post': + continue - post_id = item.find('wp:post_id', recursive=False).contents[0] - title = item.find('title', recursive=False).contents[0] - creator = item.find('dc:creator', recursive=False).contents[0] - date = item.find('wp:post_date', recursive=False).contents[0] - status = item.find('wp:status', recursive=False).contents[0] - tags = item.findAll('category', { 'domain': 'post_tag' }) - tags = '\n'.join(list(map(lambda x: ' - ' + x.contents[0], tags))) - categories = item.findAll('category', { 'domain': 'category' }) - categories = '\n'.join(list(map(lambda x: ' - ' + x.contents[0], categories))) - content = item.find('content:encoded', recursive=False).contents[0] - content = re.sub(r'', '', content) - content = re.sub(r'', '

:::more:::

', content) - content = '\n'.join([s.strip() for s in content.splitlines() if s.strip()]) - content = md(content).strip() + '\n' - frontmatter = f"---\nlayout: {post_type}\nid: {post_id}\ntitle: \"{title}\"\ncreator: \"{creator}\"\ndate: {date}\ncategories:\n{categories}\ntags:\n{tags}\ndraft: {'yes' if status == 'draft' else 'no'}\npublished: {'yes' if status == 'publish' else 'no'}\n---\n\n" + post_id = item.find('wp:post_id', recursive=False).contents[0] + title = item.find('title', recursive=False).contents[0] + creator = item.find('dc:creator', recursive=False).contents[0] + date = item.find('wp:post_date', recursive=False).contents[0] + status = item.find('wp:status', recursive=False).contents[0] + tags = item.findAll('category', {'domain': 'post_tag'}) + tags = '\n'.join(list(map(lambda x: ' - ' + x.contents[0], tags))) + categories = item.findAll('category', {'domain': 'category'}) + categories = '\n'.join( + list(map(lambda x: ' - ' + x.contents[0], categories))) + content = item.find('content:encoded', recursive=False).contents[0] + content = re.sub(r'', '', content) + content = re.sub(r'', '

:::more:::

', content) + lines = [s for s in content.splitlines()] + for i in range(len(lines)): + line = lines[i] + if line.startswith('