From 20f77f4339e3bd0ebdf17f024365407f599a4eb1 Mon Sep 17 00:00:00 2001 From: dousha Date: Fri, 27 Dec 2024 22:36:16 +0800 Subject: [PATCH] init --- .gitignore | 3 + .idea/.gitignore | 8 +++ .../inspectionProfiles/profiles_settings.xml | 6 ++ .idea/misc.xml | 4 ++ .idea/modules.xml | 8 +++ .idea/vcs.xml | 6 ++ .idea/wp-migrator.iml | 10 +++ main.py | 62 +++++++++++++++++++ requirements.txt | 8 +++ 9 files changed, 115 insertions(+) create mode 100644 .gitignore create mode 100644 .idea/.gitignore create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 .idea/wp-migrator.iml create mode 100644 main.py create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f535e21 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/venv/ +/output/ + diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..35410ca --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# 默认忽略的文件 +/shelf/ +/workspace.xml +# 基于编辑器的 HTTP 客户端请求 +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..cbe9f1e --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..c2ef794 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 
#!/usr/bin/env python3
# encoding: utf-8
"""Convert a WordPress XML export into one Markdown file per post or page.

Usage::

    main.py [export.xml]

Every ``<item>`` whose ``wp:post_type`` is ``post`` or ``page`` is written to
``output/<slug>.md`` as a YAML front matter block followed by the item body
converted to Markdown.  When no path is given on the command line the
default export file name below is used.

Third-party packages pinned by the requirements.txt shipped with this script:
beautifulsoup4==4.12.3, lxml==5.3.0, markdownify==0.14.1,
python-slugify==8.0.4, six==1.17.0, soupsieve==2.6, text-unidecode==1.3.
"""

import json
import os
import re
import sys

from bs4 import BeautifulSoup
from markdownify import markdownify as md
from slugify import slugify

DEFAULT_IN_FILE = 'wpexport20241219.xml'
OUTPUT_DIR = 'output'
POST_TYPES = ('page', 'post')

# NOTE(review): both patterns were empty (r'') in the text under review,
# most likely because the HTML comments they contained were stripped on the
# way.  An empty pattern matches between every character, so the "more"
# marker would have been inserted after each character of the body.  The
# patterns below are reconstructed from the replacement strings and from
# WordPress conventions -- confirm them against a real export.
_BLOCK_COMMENT_RE = re.compile(r'<!--\s*/?wp:.*?-->', re.DOTALL)
_MORE_TAG_RE = re.compile(r'<!--more.*?-->')
_MORE_MARKER = '\n\n:::more:::\n\n'


def _text(item, name, default=''):
    """Return the text of the direct child *name* of *item*.

    *default* is returned when the child is missing; an empty child yields
    an empty string instead of raising ``IndexError``.
    """
    node = item.find(name, recursive=False)
    if node is None:
        return default
    return node.get_text()


def _yaml_str(value):
    """Return *value* as a double-quoted YAML scalar with escapes applied."""
    # A JSON string literal is also a valid YAML double-quoted scalar, so
    # quotes and backslashes in titles can no longer break the front matter.
    return json.dumps(str(value), ensure_ascii=False)


def _yaml_list(item, domain):
    """Return the ``category`` names of *item* in *domain* as YAML list lines."""
    nodes = item.find_all('category', {'domain': domain})
    return '\n'.join(' - ' + _yaml_str(node.get_text()) for node in nodes)


def _to_markdown(html):
    """Convert the raw HTML body of an item to Markdown."""
    html = _BLOCK_COMMENT_RE.sub('', html)
    html = _MORE_TAG_RE.sub(_MORE_MARKER, html)
    # Drop blank lines and surrounding whitespace before conversion.
    stripped = (line.strip() for line in html.splitlines())
    html = '\n'.join(line for line in stripped if line)
    return md(html).strip()


def _front_matter(item, post_type):
    """Build the YAML front matter block for *item*."""
    status = _text(item, 'wp:status')
    lines = [
        '---',
        f'layout: {post_type}',
        f"title: {_yaml_str(_text(item, 'title'))}",
        f"creator: {_yaml_str(_text(item, 'dc:creator'))}",
        f"date: {_text(item, 'wp:post_date')}",
        'categories:',
        _yaml_list(item, 'category'),
        'tags:',
        _yaml_list(item, 'post_tag'),
        f"draft: {'yes' if status == 'draft' else 'no'}",
        '---',
    ]
    return '\n'.join(lines) + '\n\n'


def _write_unique(slug, text):
    """Write *text* to ``output/<slug>.md`` without overwriting any file.

    When the name is taken, ``_(1)``, ``_(2)``, ... is appended to the slug
    until a free name is found.  Returns the path that was written.
    """
    candidate = os.path.join(OUTPUT_DIR, slug + '.md')
    attempt = 1
    while True:
        try:
            # Mode 'x' fails if the file exists, so the existence check and
            # the creation happen in a single step.
            out = open(candidate, 'x', encoding='utf-8')
        except FileExistsError:
            candidate = os.path.join(OUTPUT_DIR, f'{slug}_({attempt}).md')
            attempt += 1
            continue
        with out:
            out.write(text)
        return candidate


def main(argv=None):
    """Run the conversion.

    *argv* is the list of command-line arguments without the program name;
    it defaults to ``sys.argv[1:]``.  The first argument, when present, is
    the path of the export file to read.
    """
    args = sys.argv[1:] if argv is None else argv
    in_file = args[0] if args else DEFAULT_IN_FILE

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    with open(in_file, 'r', encoding='utf-8') as f:
        raw_content = f.read()

    soup = BeautifulSoup(raw_content, 'xml')

    items = soup.find_all('item')
    print(f"{len(items)} items found")

    for item in items:
        post_type = _text(item, 'wp:post_type')
        if post_type not in POST_TYPES:
            continue

        frontmatter = _front_matter(item, post_type)
        content = _to_markdown(_text(item, 'content:encoded'))

        # slugify() can return an empty string, e.g. for an empty title.
        title_slug = slugify(_text(item, 'title')) or 'untitled'
        _write_unique(title_slug, frontmatter + content)


if __name__ == '__main__':
    main()