diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000..bb08a0c
Binary files /dev/null and b/.DS_Store differ
diff --git a/.idea/misc.xml b/.idea/misc.xml
index cbe9f1e..d6d2e87 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,4 +1,7 @@
+
+
+
\ No newline at end of file
diff --git a/main.py b/main.py
index dc8296f..79caa85 100644
--- a/main.py
+++ b/main.py
@@ -6,36 +6,42 @@ from slugify import slugify
from markdownify import MarkdownConverter
import os
import re
+import json
in_file = 'wpexport20241219.xml'
+
class IframeConverter(MarkdownConverter):
- def convert_iframe(self, el, text, convert_as_inline):
- src = el['src']
- if 'youtube' in src:
- code = re.search(r'embed/([^?]+)', src).group(1)
- return f"::: youtube {code}\n:::\n"
-
- if 'steam' in src:
- code = re.search(r'\/([0-9]+)', src).group(1)
- return f"::: steam {code}\n:::\n"
-
- return f"::: iframe {src}\n:::\n"
+ def convert_iframe(self, el, text, convert_as_inline):
+ src = el['src']
+ if 'youtube' in src:
+ code = re.search(r'embed/([^?]+)', src).group(1)
+ return f"::: youtube {code}\n:::\n"
+
+ if 'steam' in src:
+ code = re.search(r'\/([0-9]+)', src).group(1)
+ return f"::: steam {code}\n:::\n"
+
+ return f"::: iframe {src}\n:::\n"
def mkdirp(path):
- if not os.path.exists(path):
- os.makedirs(path)
+ if not os.path.exists(path):
+ os.makedirs(path)
def md(html, **options):
- return IframeConverter(**options).convert(html)
+ return IframeConverter(**options).convert(html)
+
+
+def code_parser(el):
+ return el['data-language'][0] if el.has_attr('data-language') else 'text'
mkdirp('output')
with open(in_file, 'r', encoding='utf-8') as f:
- raw_content = f.read()
+ raw_content = f.read()
soup = BeautifulSoup(raw_content, 'xml')
@@ -43,43 +49,64 @@ items = soup.find_all('item')
print(f"{len(items)} items found")
for item in items:
- post_type_raw = item.find('wp:post_type', recursive=False)
- if post_type_raw is None:
- continue
+ post_type_raw = item.find('wp:post_type', recursive=False)
+ if post_type_raw is None:
+ continue
- post_type = post_type_raw.contents[0]
- if post_type != 'page' and post_type != 'post':
- continue
+ post_type = post_type_raw.contents[0]
+ if post_type != 'page' and post_type != 'post':
+ continue
- post_id = item.find('wp:post_id', recursive=False).contents[0]
- title = item.find('title', recursive=False).contents[0]
- creator = item.find('dc:creator', recursive=False).contents[0]
- date = item.find('wp:post_date', recursive=False).contents[0]
- status = item.find('wp:status', recursive=False).contents[0]
- tags = item.findAll('category', { 'domain': 'post_tag' })
- tags = '\n'.join(list(map(lambda x: ' - ' + x.contents[0], tags)))
- categories = item.findAll('category', { 'domain': 'category' })
- categories = '\n'.join(list(map(lambda x: ' - ' + x.contents[0], categories)))
- content = item.find('content:encoded', recursive=False).contents[0]
- content = re.sub(r'', '', content)
- content = re.sub(r'', '
:::more:::
', content)
- content = '\n'.join([s.strip() for s in content.splitlines() if s.strip()])
- content = md(content).strip() + '\n'
- frontmatter = f"---\nlayout: {post_type}\nid: {post_id}\ntitle: \"{title}\"\ncreator: \"{creator}\"\ndate: {date}\ncategories:\n{categories}\ntags:\n{tags}\ndraft: {'yes' if status == 'draft' else 'no'}\npublished: {'yes' if status == 'publish' else 'no'}\n---\n\n"
+ post_id = item.find('wp:post_id', recursive=False).contents[0]
+ title = item.find('title', recursive=False).contents[0]
+ creator = item.find('dc:creator', recursive=False).contents[0]
+ date = item.find('wp:post_date', recursive=False).contents[0]
+ status = item.find('wp:status', recursive=False).contents[0]
+ tags = item.findAll('category', {'domain': 'post_tag'})
+ tags = '\n'.join(list(map(lambda x: ' - ' + x.contents[0], tags)))
+ categories = item.findAll('category', {'domain': 'category'})
+ categories = '\n'.join(
+ list(map(lambda x: ' - ' + x.contents[0], categories)))
+ content = item.find('content:encoded', recursive=False).contents[0]
+ content = re.sub(r'', '', content)
+ content = re.sub(r'', ':::more:::
', content)
+ lines = [s for s in content.splitlines()]
+ for i in range(len(lines)):
+ line = lines[i]
+ if line.startswith('