summaryrefslogtreecommitdiff
path: root/modules/blogpost_processor.py
diff options
context:
space:
mode:
authorBotond Hende <nettingman@gmail.com>2024-09-04 20:05:51 +0200
committerBotond Hende <nettingman@gmail.com>2024-09-04 20:05:51 +0200
commitbea80928f694671010bc99493d31879df7d42836 (patch)
treeafb71282df8f01ce4b2dd87a292febe7ad8e3537 /modules/blogpost_processor.py
initial commit
Diffstat (limited to 'modules/blogpost_processor.py')
-rw-r--r--modules/blogpost_processor.py228
1 files changed, 228 insertions, 0 deletions
diff --git a/modules/blogpost_processor.py b/modules/blogpost_processor.py
new file mode 100644
index 0000000..b29d91c
--- /dev/null
+++ b/modules/blogpost_processor.py
@@ -0,0 +1,228 @@
+import datetime
+import email.utils
+import json
+import os.path
+import random
+import urllib
+from typing import Optional
+
+import marko
+import marko.inline
+
+from .warko.strikethrough import StrikeThrough
+from .warko.headinginjector import get_heading_injector
+from .warko.extendedimage import get_image_renderer, ExtendedImageElement
+from .warko.newtablink import get_new_tab_links
+from ..config import Config
+
+CONTENT_FILE_NAME = "content.md"  # Markdown body of a post; parsed in Post.__init__
+META_FILE_NAME = "meta.json"  # JSON metadata of a post; loaded in Post.__init__
+
+TITLE_KEY = "title"  # meta.json key returned by Post.title()
+PUBLISH_DATE_KEY = "publish_date"  # meta.json key; first 4 chars are used as the year, also the sort key in get_posts
+TAGS_KEY = "tags"  # meta.json key iterated by Post.get_tags()
+THUMBNAIL_KEY = "thumbnail"  # optional meta.json key; first image of the content is used when absent
+INTRO_KEY = "intro"  # optional meta.json key; first paragraph of the content is used when absent
+
+INTRO_DESIRED_LENGTH = 320  # auto-extracted intro text is cut to this many characters
+INTRO_MAX_EXTRA_LENGTH = 100  # compared against the distance between the cut point and the last period
+
+posts_cache = []  # filled by get_posts(), read (as a copy) by get_posts_from_cache()
+
+
def get_posts_from_cache() -> list['Post']:
    """Return a shallow copy of the module-level post cache.

    The copy lets callers reorder or pop entries without touching the
    cache itself.

    Raises:
        RuntimeError: if the cache is still empty when it is queried.
    """
    if not posts_cache:
        raise RuntimeError("Getting posts list from cache while it is still empty.")

    return posts_cache.copy()
+
+
class Post:
    """A blog post loaded from a directory on disk.

    The directory must contain ``content.md`` (the Markdown body) and
    ``meta.json`` (the metadata). Any other ``*.md`` file is rendered as a
    subpage; every remaining entry is recorded as an extra file.

    Attributes set by ``__init__``:
        path: directory the post was loaded from.
        name: last path component of ``path``.
        content: parsed Markdown document of ``content.md``.
        html: ``content`` rendered to HTML.
        meta_data: decoded ``meta.json``.
        subpages: list of ``(name_without_extension, html)`` tuples.
        extra_files: names of the non-Markdown entries in the directory.
        href: site-relative URL, ``/posts/<year>/<name>``.
        thumbnail: ``href``-prefixed image path, or ``None`` when the post
            has neither a thumbnail in its metadata nor an image.
        intro: intro text from the metadata, or the shortened first paragraph.
    """

    # Shared Markdown parser/renderer. The host list is handed to the project
    # extensions as-is; NOTE(review): presumably it marks the blog's own
    # domain for image/link handling — confirm in the warko modules.
    PARSER = marko.Markdown(
        renderer=marko.HTMLRenderer,
        extensions=[
            StrikeThrough,
            get_heading_injector("> "),
            get_image_renderer(["blog.wazul.moe"]),
            get_new_tab_links(["blog.wazul.moe"]),
        ],
    )

    def __init__(self, path: str):
        """Load, parse and render the post stored in directory ``path``.

        Raises:
            OSError: if ``content.md`` or ``meta.json`` cannot be opened.
            json.JSONDecodeError: if ``meta.json`` is not valid JSON.
            KeyError: if the metadata has no publish date (needed for ``href``).
        """
        self.path = path
        self.name = os.path.basename(path)

        # Read with an explicit encoding so the result does not depend on the
        # platform's default locale.
        with open(os.path.join(path, CONTENT_FILE_NAME), encoding="utf-8") as f:
            self.content = Post.PARSER.parse(f.read())

        self.html = Post.PARSER.render(self.content)

        with open(os.path.join(path, META_FILE_NAME), encoding="utf-8") as f:
            self.meta_data = json.load(f)

        self.subpages = []
        self.extra_files = []
        # os.listdir() order is arbitrary; sort so subpage order is stable.
        for elem in sorted(os.listdir(path)):
            if elem in (CONTENT_FILE_NAME, META_FILE_NAME):
                continue

            if elem.endswith(".md"):
                with open(os.path.join(path, elem), encoding="utf-8") as f:
                    subpage_content = Post.PARSER.parse(f.read())

                # Strip the ".md" suffix to get the subpage name.
                self.subpages.append((elem[:-3], Post.PARSER.render(subpage_content)))
            else:
                self.extra_files.append(elem)

        self.href = f"/posts/{self.get_publish_year()}/{self.name}"

        if THUMBNAIL_KEY in self.meta_data:
            thumbnail = self.meta_data[THUMBNAIL_KEY]
        else:
            thumbnail = Post._get_first_image_path(self.content)

        # Without this guard an image-less post would get the bogus URL
        # "<href>/None".
        self.thumbnail: Optional[str] = f"{self.href}/{thumbnail}" if thumbnail else None

        if INTRO_KEY in self.meta_data:
            self.intro = self.meta_data[INTRO_KEY]
        else:
            # Already shortened by the extractor; no second truncation needed.
            self.intro = Post._extract_first_paragraph_text(self.content)

    def title(self) -> str:
        """Return the post title from the metadata."""
        return self.meta_data[TITLE_KEY]

    def get_tags(self) -> str:
        """Return the post's tags as space-separated HTML links to their tag pages."""
        return " ".join(
            f"<a href=\"{Config.BLOG_ROOT_URL}/tags/{tag}.html\">{tag}</a>"
            for tag in self.meta_data[TAGS_KEY]
        )

    def get_link(self) -> str:
        """Return the post URL obtained by joining the blog root URL and ``href``."""
        # Import the submodule explicitly: a bare ``import urllib`` does not
        # guarantee that ``urllib.parse`` has been loaded.
        import urllib.parse

        return urllib.parse.urljoin(Config.BLOG_ROOT_URL, self.href)

    def get_publish_time(self) -> str:
        """Return the publish date string exactly as stored in the metadata."""
        return self.meta_data[PUBLISH_DATE_KEY]

    def get_publish_time_rfc2822(self) -> str:
        """Return the publish date formatted per RFC 2822.

        The metadata only carries a date, so a fixed time of 12:00 at UTC
        offset +02:00 is appended before parsing.
        """
        stamp = datetime.datetime.fromisoformat(self.get_publish_time() + " 12:00+02:00")
        return email.utils.format_datetime(stamp)

    def get_publish_year(self) -> str:
        """Return the first four characters of the publish date (the year)."""
        return self.meta_data[PUBLISH_DATE_KEY][0:4]

    def get_fake_path(self) -> str:
        """Return the shell-style pseudo path of the post, ``~/posts/<year>/<name>``."""
        return f"~/posts/{self.get_publish_year()}/{self.name}"

    def get_prompt(self, cmd: str) -> str:
        """Return ``Config.get_prompt`` for ``cmd`` with the post's fake path as location."""
        return Config.get_prompt(self.get_fake_path(), cmd)

    def get_cat_prompt(self, file: str) -> str:
        """Return the prompt for running ``cat <file>`` inside the post's fake path."""
        return self.get_prompt(f"cat {file}")

    def get_index_prompt(self) -> str:
        """Return the prompt for running ``head`` on the post's content from ``~``."""
        return Config.get_prompt("~", f"head {self.get_fake_path()}/content")

    def get_similar_posts(self) -> list['Post']:
        """Return up to five posts related to this one.

        The post at the next index of the cached list comes first (the
        previous post, since get_posts() sorts newest first), followed by a
        random selection of the posts surrounding this one.

        Raises:
            Exception: if the post cache is empty (from get_posts_from_cache).
            ValueError: if this post is not in the cache.
        """
        # TODO move to config
        MAX_SIMILAR_POST_COUNT = 5
        POSTS_AROUND_DISTANCE = 5

        posts = get_posts_from_cache()
        idx_self = posts.index(self)

        ret_list = []

        # add the previous post; popping keeps it out of the random pool below
        if idx_self < len(posts) - 1:
            ret_list.append(posts.pop(idx_self + 1))

        # TODO add some tagbased search when I have more content

        # fallback: add random posts from around the current post.
        # Clamp the start: a negative start index would count from the END of
        # the list and produce a window that does not even contain this post.
        window_start = max(0, idx_self - POSTS_AROUND_DISTANCE)
        window_end = idx_self + POSTS_AROUND_DISTANCE + 1
        posts_around = [post for post in posts[window_start:window_end] if post is not self]

        free_slots = MAX_SIMILAR_POST_COUNT - len(ret_list)
        ret_list.extend(random.sample(posts_around, min(free_slots, len(posts_around))))

        return ret_list

    def get_similar_posts_ls(self) -> str:
        """Return the ``ls``-style listing of similar posts.

        Placeholder: currently lists this post three times instead of the
        result of get_similar_posts().
        """
        # TODO fix
        return self.generate_similar_posts_ls([self, self, self])

    @staticmethod
    def generate_similar_posts_ls(other_posts: list['Post']) -> str:
        """Render ``other_posts`` as fake ``ls -l`` symlink lines joined with ``<br>``.

        Each line shows the blog owner as user and group, the length of the
        post's fake path as size, the publish date, a fixed ``11:11`` time,
        and the quoted title pointing at the fake path.
        """
        lines = ["total 0"]

        # TODO random time and fix filename escape
        lines.extend(
            "lrwxrwxrwx 1 {} {} {} {} 11:11 '{}' -> {}".format(
                Config.BLOG_OWNER, Config.BLOG_OWNER, len(post.get_fake_path()),
                post.get_publish_time(), post.title(), post.get_fake_path())
            for post in other_posts
        )

        return "<br>".join(lines)

    @staticmethod
    def _shorten_intro(text: str) -> str:
        """Shorten ``text`` to at most INTRO_DESIRED_LENGTH characters (plus dots).

        Text within the limit is returned unchanged. Longer text is cut at
        the limit and then ends either at its last period, when that period
        is at most INTRO_MAX_EXTRA_LENGTH characters before the cut, or with
        an appended ``...``.
        """
        if len(text) <= INTRO_DESIRED_LENGTH:
            return text

        shortened = text[:INTRO_DESIRED_LENGTH]  # cut to length
        # first try to cut at the last period, if it's not too far...
        last_dot_pos = shortened.rfind(".")
        dot_is_close = (
            last_dot_pos != -1  # rfind() returns -1 when there is no period at all
            and INTRO_DESIRED_LENGTH - last_dot_pos <= INTRO_MAX_EXTRA_LENGTH
        )
        if dot_is_close:
            return shortened[:last_dot_pos + 1]

        return shortened + "..."  # If too far, just add more dots

    @staticmethod
    def _extract_first_paragraph_text(root) -> str:
        """Return the shortened plain text of the first paragraph under ``root``.

        Only raw text, line breaks (turned into a single space) and the raw
        text directly inside links are collected; other inline elements are
        skipped. Returns ``""`` when ``root`` has no paragraph child.
        """
        for child in root.children:
            if not isinstance(child, marko.block.Paragraph):
                continue

            paragraph_str = ""
            for part in child.children:
                if isinstance(part, marko.inline.RawText):
                    if isinstance(part.children, str):
                        paragraph_str += part.children
                elif isinstance(part, marko.inline.LineBreak):
                    # avoid doubling a space that is already there
                    if paragraph_str[-1:] != " ":
                        paragraph_str += " "
                elif isinstance(part, marko.inline.Link):
                    for part_child in part.children:
                        if isinstance(part_child, marko.inline.RawText) and isinstance(part_child.children, str):
                            paragraph_str += part_child.children

            return Post._shorten_intro(paragraph_str)  # return after the first paragraph

        return ""

    @staticmethod
    def _get_first_image_path(root) -> Optional[str]:
        """Return the path of the first image found depth-first under ``root``.

        Plain Markdown images yield their ``dest``, extended image elements
        their ``src``. Returns ``None`` when no image with a non-empty path
        is found.
        """
        if isinstance(root, marko.inline.Image):
            return root.dest

        if isinstance(root, ExtendedImageElement):
            return root.src

        if hasattr(root, 'children'):
            for elm in root.children:
                img = Post._get_first_image_path(elm)
                if img:
                    return img

        return None
+
+
def get_posts(path: str) -> list['Post']:
    """Load every post directory under ``path`` and refresh the post cache.

    Each sub-directory of ``path`` is loaded as a Post. The result is sorted
    by publish date, newest first, and replaces the contents of the
    module-level ``posts_cache``.

    Args:
        path: directory whose sub-directories each hold one post.

    Returns:
        The sorted list of posts.
    """
    return_list = []

    # Sorted listing: makes the order of posts sharing a publish date
    # deterministic (the sort below is stable).
    for directory in sorted(os.listdir(path)):
        post_path = os.path.join(path, directory)
        # Only directories can hold a post; a stray regular file would make
        # Post() fail when it tries to open "<file>/content.md".
        if not os.path.isdir(post_path):
            continue

        return_list.append(Post(post_path))

    return_list.sort(key=lambda post: post.meta_data[PUBLISH_DATE_KEY], reverse=True)

    # Mutate in place so existing references to the cache list stay valid.
    posts_cache.clear()
    posts_cache.extend(return_list)

    return return_list