# Origin: modules/blogpost_processor.py as added by commit
# bea80928f694671010bc99493d31879df7d42836 ("initial commit", Botond Hende, 2024-09-04).
"""Load blog posts from disk, render their Markdown and expose per-post helpers."""
import datetime
import email.utils
import json
import os.path
import random
import urllib.parse
from typing import Optional

import marko
import marko.block
import marko.inline

from .warko.strikethrough import StrikeThrough
from .warko.headinginjector import get_heading_injector
from .warko.extendedimage import get_image_renderer, ExtendedImageElement
from .warko.newtablink import get_new_tab_links
from ..config import Config

CONTENT_FILE_NAME = "content.md"
META_FILE_NAME = "meta.json"

TITLE_KEY = "title"
PUBLISH_DATE_KEY = "publish_date"
TAGS_KEY = "tags"
THUMBNAIL_KEY = "thumbnail"
INTRO_KEY = "intro"

INTRO_DESIRED_LENGTH = 320
INTRO_MAX_EXTRA_LENGTH = 100

# Filled by get_posts(); read through get_posts_from_cache().
posts_cache = []


def get_posts_from_cache() -> list['Post']:
    """Return a shallow copy of the posts collected by the last get_posts() call.

    Raises:
        Exception: if the cache is still empty.
    """
    if not posts_cache:
        raise Exception("Getting posts list from cache while it is still empty.")

    return posts_cache.copy()


def _shorten_intro(text: str) -> str:
    """Shorten *text* to roughly INTRO_DESIRED_LENGTH characters.

    Text that already fits is returned unchanged.  Otherwise it is cut to
    INTRO_DESIRED_LENGTH characters and then either trimmed back to the last
    period (when that period lies within INTRO_MAX_EXTRA_LENGTH characters of
    the cut) or suffixed with "...".
    """
    if len(text) <= INTRO_DESIRED_LENGTH:
        return text

    text = text[:INTRO_DESIRED_LENGTH]  # cut to length
    last_dot_pos = text.rfind(".")
    # rfind() returns -1 when there is no period at all; treat that as "too far".
    if last_dot_pos != -1 and INTRO_DESIRED_LENGTH - last_dot_pos <= INTRO_MAX_EXTRA_LENGTH:
        return text[:last_dot_pos + 1]

    return text + "..."  # period too far back (or missing): just add dots


class Post:
    """A single blog post loaded from a directory.

    The directory is expected to contain CONTENT_FILE_NAME (Markdown) and
    META_FILE_NAME (JSON).  Any other "*.md" file becomes a rendered subpage
    and every remaining entry is recorded as an extra file.
    """

    # Shared Markdown parser/renderer used for the content and all subpages.
    PARSER = marko.Markdown(
        renderer=marko.HTMLRenderer,
        extensions=[
            StrikeThrough,
            get_heading_injector("> "),
            get_image_renderer(["blog.wazul.moe"]),
            get_new_tab_links(["blog.wazul.moe"]),
        ],
    )

    def __init__(self, path: str):
        """Load, parse and render the post stored in the directory *path*."""
        self.path = path
        self.name = os.path.basename(path)

        # Explicit encoding so parsing does not depend on the platform default.
        with open(os.path.join(path, CONTENT_FILE_NAME), encoding="utf-8") as f:
            self.content = Post.PARSER.parse(f.read())

        self.html = Post.PARSER.render(self.content)

        with open(os.path.join(path, META_FILE_NAME), encoding="utf-8") as f:
            self.meta_data = json.load(f)

        self.subpages = []  # (name without ".md", rendered HTML) tuples
        self.extra_files = []  # every other directory entry, by name
        for elem in os.listdir(path):
            if elem in (CONTENT_FILE_NAME, META_FILE_NAME):
                continue

            if elem.endswith(".md"):
                with open(os.path.join(path, elem), encoding="utf-8") as f:
                    subpage_content = Post.PARSER.parse(f.read())

                self.subpages.append((elem[:-3], Post.PARSER.render(subpage_content)))
            else:
                self.extra_files.append(elem)

        self.href = f"/posts/{self.get_publish_year()}/{self.name}"

        if THUMBNAIL_KEY in self.meta_data:
            self.thumbnail = self.meta_data[THUMBNAIL_KEY]
        else:
            self.thumbnail = Post._get_first_image_path(self.content)

        # NOTE(review): when the post has no image and no thumbnail key this
        # yields ".../None"; confirm how consumers should see a missing thumbnail.
        self.thumbnail = f"{self.href}/{self.thumbnail}"

        if INTRO_KEY in self.meta_data:
            self.intro = self.meta_data[INTRO_KEY]
        else:
            # The helper already returns the shortened text.
            self.intro = Post._extract_first_paragraph_text(self.content)

    def title(self) -> str:
        """Return the title stored in the metadata."""
        return self.meta_data[TITLE_KEY]

    def get_tags(self) -> str:
        """Return one formatted entry per tag, joined by single spaces."""
        # NOTE(review): the format string has one placeholder but receives three
        # arguments; it looks like surrounding markup was lost - confirm against
        # the repository before relying on this output.
        return " ".join(["{}".format(Config.BLOG_ROOT_URL, tag, tag) for tag in
                         self.meta_data[TAGS_KEY]])

    def get_link(self) -> str:
        """Return the post href joined onto Config.BLOG_ROOT_URL."""
        return urllib.parse.urljoin(Config.BLOG_ROOT_URL, self.href)

    def get_publish_time(self) -> str:
        """Return the publish date string stored in the metadata."""
        return self.meta_data[PUBLISH_DATE_KEY]

    def get_publish_time_rfc2822(self) -> str:
        """Return the publish date as an RFC 2822 string, fixed at 12:00 +02:00."""
        return email.utils.format_datetime(datetime.datetime.fromisoformat(self.get_publish_time() + " 12:00+02:00"))

    def get_publish_year(self) -> str:
        """Return the first four characters of the publish date string."""
        return self.meta_data[PUBLISH_DATE_KEY][0:4]

    def get_fake_path(self) -> str:
        """Return the shell-style display path "~/posts/<year>/<name>"."""
        return "~/posts/{}/{}".format(self.get_publish_year(), self.name)

    def get_prompt(self, cmd: str) -> str:
        """Return Config.get_prompt() for *cmd* at this post's fake path."""
        return Config.get_prompt(self.get_fake_path(), cmd)

    def get_cat_prompt(self, file: str) -> str:
        """Return the prompt for running "cat <file>" at this post's fake path."""
        return self.get_prompt(f"cat {file}")

    def get_index_prompt(self) -> str:
        """Return the prompt for running "head <fake path>/content" from "~"."""
        return Config.get_prompt("~", f"head {self.get_fake_path()}/content")

    def get_similar_posts(self) -> list['Post']:
        """Return up to MAX_SIMILAR_POST_COUNT posts related to this one.

        The entry following this post in the cached list is added first, then
        random posts from a window around this post fill the remaining slots.

        Raises:
            Exception: if the posts cache is empty.
            ValueError: if this post is not in the cache.
        """
        ret_list = []

        posts = get_posts_from_cache()
        idx_self = posts.index(self)

        # TODO move to config
        MAX_SIMILAR_POST_COUNT = 5
        POSTS_AROUND_DISTANCE = 5

        # add the previous post
        if idx_self < len(posts) - 1:
            ret_list.append(posts.pop(idx_self + 1))

        # TODO add some tagbased search when I have more content

        # fallback: add random posts from around the current post.
        # Clamp the start: a negative index would wrap around to the end of the
        # list and the slice could then miss this post entirely.
        window_start = max(0, idx_self - POSTS_AROUND_DISTANCE)
        posts_around = posts[window_start:idx_self + POSTS_AROUND_DISTANCE]
        posts_around.remove(self)

        remaining = MAX_SIMILAR_POST_COUNT - len(ret_list)
        ret_list.extend(random.sample(posts_around, min(len(posts_around), remaining)))

        return ret_list

    def get_similar_posts_ls(self) -> str:
        """Return the "ls"-style listing (currently a placeholder built from self)."""
        # TODO fix
        return self.generate_similar_posts_ls([self, self, self])

    @staticmethod
    def generate_similar_posts_ls(other_posts: list['Post']) -> str:
        """Format *other_posts* as an "ls -l"-like listing of symlinks."""
        lines = ["total 0"]

        for post in other_posts:
            lines.append("lrwxrwxrwx 1 {} {} {} {} 11:11 '{}' -> {}".format(  # TODO random time and fix filename escape
                Config.BLOG_OWNER, Config.BLOG_OWNER, len(post.get_fake_path()), post.get_publish_time(), post.title(),
                post.get_fake_path())
            )

        # NOTE(review): the separator literal reached us as a raw line break;
        # confirm against the repository that "\n" is the intended separator.
        return "\n".join(lines)

    @staticmethod
    def _extract_first_paragraph_text(root) -> str:
        """Return the shortened plain text of the first paragraph under *root*.

        Only raw text, line breaks (as a single space) and the raw text inside
        links are collected.  Returns "" when *root* has no paragraph child.
        """
        for child in root.children:
            if not isinstance(child, marko.block.Paragraph):
                continue

            paragraph_str = ""
            for part in child.children:
                if isinstance(part, marko.inline.RawText) and isinstance(part.children, str):
                    paragraph_str += part.children
                elif isinstance(part, marko.inline.LineBreak):
                    # avoid doubling a space that is already there
                    if paragraph_str[-1:] != " ":
                        paragraph_str += " "
                elif isinstance(part, marko.inline.Link):
                    for part_child in part.children:
                        if isinstance(part_child, marko.inline.RawText) and isinstance(part_child.children, str):
                            paragraph_str += part_child.children

            return _shorten_intro(paragraph_str)  # return after the first paragraph

        return ""

    @staticmethod
    def _get_first_image_path(root) -> Optional[str]:
        """Return the path of the first image found depth-first under *root*, or None."""
        if isinstance(root, marko.inline.Image):
            return root.dest

        if isinstance(root, ExtendedImageElement):
            return root.src

        if hasattr(root, 'children'):
            for elm in root.children:
                img = Post._get_first_image_path(elm)
                if img:
                    return img

        return None


def get_posts(path: str) -> list[Post]:
    """Load every post directory under *path*, newest first, and refresh the cache.

    Entries of *path* that are not directories are skipped.
    """
    return_list = []

    for directory in os.listdir(path):
        post_path = os.path.join(path, directory)
        # A stray file in the posts folder cannot be opened as a post directory.
        if os.path.isdir(post_path):
            return_list.append(Post(post_path))

    return_list.sort(key=lambda post: post.meta_data[PUBLISH_DATE_KEY], reverse=True)

    posts_cache.clear()
    posts_cache.extend(return_list)

    return return_list