From 9748b380710bcd28c5f4a8e195283cdb96113f04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaakko=20Kera=CC=88nen?= Date: Mon, 5 Jun 2023 19:21:54 +0300 Subject: [PATCH 1/1] Archive: Time range based archiving Added "/export/month/YYYY-MM.gpub" for exporting a monthly archive. --- model.py | 62 +++++++++++---- subspace.py | 213 +++++++++++++++++++++++++++++++++++++++++----------- utils.py | 5 ++ 3 files changed, 222 insertions(+), 58 deletions(-) diff --git a/model.py b/model.py index 306a97b..2ddf23b 100644 --- a/model.py +++ b/model.py @@ -6,7 +6,7 @@ import re import shutil import time from typing import Union -from utils import ago_text, clean_title, parse_at_names, shorten_text, \ +from utils import ago_text, clean_title, parse_at_names, shorten_text, strip_links, \ GeminiError, UTC, INNER_LINK_PREFIX @@ -149,8 +149,9 @@ class Notification: event = f'removed you as moderator of s/{self.subname}' if with_title: - vis_title = self.post_title if self.post_title else \ - shorten_text(clean_title(self.post_summary), 50) if self.post_summary else None + vis_title = shorten_text(self.post_title, 50) if self.post_title \ + else shorten_text(strip_links(clean_title(self.post_summary)), 50) if self.post_summary \ + else None if vis_title: if self.type == Notification.MENTION: event += ' in' @@ -455,7 +456,11 @@ class Database: ts_edited TIMESTAMP DEFAULT CURRENT_TIMESTAMP, ts_comment TIMESTAMP DEFAULT CURRENT_TIMESTAMP, -- time of latest comment summary TEXT DEFAULT '', - UNIQUE KEY (subspace, issueid) + UNIQUE KEY (subspace, issueid), + INDEX (subspace), + INDEX (parent), + INDEX (user), + INDEX (issueid) )""") db.execute("""CREATE TABLE IF NOT EXISTS tags ( @@ -864,6 +869,22 @@ class Database: url, label, post)) return files + def get_time_files(self, ts_range): + cur = self.conn.cursor() + cur.execute(""" + SELECT + f.id, f.segment, f.name, f.mimetype, f.data, s.url, s.content, s.post, p.user + FROM files f + JOIN segments s ON s.id=f.segment + JOIN posts p ON s.post=p.id + WHERE UNIX_TIMESTAMP(p.ts_edited)>=? AND UNIX_TIMESTAMP(p.ts_edited)=? AND UNIX_TIMESTAMP(p.ts_edited) 0: + fn = '_' + fn + #if len(fn) == 0: + # fn = f'{self.dt.day}_post{self.post_id}.gmi' + return f'{self.dt.year:04d}-{self.dt.month:02d}/{self.post_id}{fn}.gmi' + + def __init__(self, session, user=None, subspace=None, month_range=None): self.session = session self.db = session.db + self.ts_range = None + if month_range: + year, month = month_range + end_month = month + 1 if month < 12 else 1 + end_year = year if month < 12 else year + 1 + self.ts_range = ( + datetime.datetime(year, month, 1, 0, 0, 0, tzinfo=UTC).timestamp(), + datetime.datetime(end_year, end_month, 1, 0, 0, 0, tzinfo=UTC).timestamp() + ) self.user = user self.subspace = subspace - self.is_user = subspace.owner != 0 + self.is_user = self.ts_range is None and subspace.owner != 0 assert self.is_user and self.user or not self.is_user and not self.user - assert self.subspace is not None + assert self.ts_range or self.subspace is not None # Modify settion so rendered pages appear to be not logged in. session.user = None self.site_link = session.server_root() - - generator = f'Generated with 💬 Bubble v{session.bubble.version}.' + if month_range: + archive_title = f'{datetime.datetime(year, month, 1).strftime("%B %Y")}' + archive_description = f'All posts and comments made on {session.bubble.site_name}. ' + else: + archive_title = f'{"s/" if not self.is_user else ""}{subspace.name} on {session.bubble.site_name}' + archive_description = \ + (f'All posts and comments made in the subspace {subspace.title()} on {session.bubble.site_name}. ' if not self.is_user else f'All posts and comments made by {user.name} on {session.bubble.site_name}. ') self.metadata = { 'gpubVersion': '1.0.0', - 'title': f'{"s/" if not self.is_user else ""}{subspace.name} on {session.bubble.site_name}', - 'author': 'Bubble Archiver' if not self.is_user else user.name, + 'title': archive_title, + 'description': archive_description, + 'author': f'Bubble v{session.bubble.version}', 'publishDate': time.strftime('%Y-%m-%d'), - 'index': 'index.gmi', - 'description': (f'All posts and comments made in the subspace {subspace.title()} on {session.bubble.site_name}. ' if not self.is_user else f'All posts and comments made by {user.name} on {session.bubble.site_name}. ') + generator + 'index': 'index.gmi' } self.local_entries = [] # posts in the archive's subspace self.foreign_entries = [] # posts in other subspaces + self.subspace_entries = {} # subspace name => list of entries self.comment_entries = [] # posts where user has commented self.file_entries = [] # files self.entry_index = {} # indexed by post ID self.file_index = {} # indexed by file ID self.referenced_users = {} # info about posters + self.total_count = [0, 0] + self.subspace_count = {} # [posts, comments] self.subspaces = {} self.users = {} @@ -452,15 +477,18 @@ class GempubArchive: self.session.context = self.get_subspace(post.subspace) self.session.is_context_tracker = (self.session.context.flags & Subspace.ISSUE_TRACKER) != 0 - is_local = post.subspace == self.subspace.id - where = self.session.context.title() if not is_local and ( - not self.is_user or is_comment) else None - label_sub = ' · ' + where if where else '' + is_local = (post.subspace == self.subspace.id) if self.subspace else False + if not self.ts_range: + where = self.session.context.title() if not is_local and ( + not self.is_user or is_comment) else None + label_sub = ' · ' + where if where else '' page = make_post_page(self.session, post) - entry = GempubArchive.Entry(post, - (post.title if post.title else shorten_text(clean_title(post.summary), 100)) + label_sub, - page) + if self.ts_range: + label = shorten_text(clean_title(strip_links(post.summary)), 150) + else: + label = (post.title if post.title else shorten_text(clean_title(strip_links(post.summary)), 100)) + label_sub + entry = GempubArchive.Entry(post, label, page) # Check for referenced users. for username in re.findall(r'=> /u/([\w-]+)\s', page): @@ -475,16 +503,40 @@ class GempubArchive: else: self.foreign_entries.append(entry) + skey = self.session.context.name + if skey in self.subspace_entries: + self.subspace_entries[skey].append(entry) + else: + self.subspace_entries[skey] = [entry] + + if not post.id in self.entry_index: + if not is_comment: + self.add_count(post.subspace, + (1, self.db.count_posts(parent_id=post.id, draft=False))) + self.entry_index[post.id] = entry + def add_count(self, subspace_id, count): + self.total_count[0] += count[0] + self.total_count[1] += count[1] + if not subspace_id in self.subspace_count: + self.subspace_count[subspace_id] = [count[0], count[1]] + else: + self.subspace_count[subspace_id][0] += count[0] + self.subspace_count[subspace_id][1] += count[1] + def render_post_entries(self): db = self.db # Entries for the user/subspace posts. if self.is_user: posts = db.get_posts(user=self.user, comment=False, draft=False) + elif self.ts_range: + posts = db.get_posts(ts_range=self.ts_range, comment=False, draft=False, + sort_descending=False) else: posts = db.get_posts(subspace=self.subspace, comment=False, draft=False) + for post in posts: self.add_post_entry(post) @@ -492,7 +544,8 @@ class GempubArchive: # Make entries for posts where user has commented in. # TODO: Add a proper database query for this. commented_in = set() - for cmt in db.get_posts(user=self.user, comment=True, draft=False): + for cmt in db.get_posts(user=self.user, comment=True, draft=False, + sort_descending=False): commented_in.add(cmt.parent) for post in [db.get_post(id=post_id) for post_id in commented_in]: if post and post.user != self.user.id: @@ -501,7 +554,8 @@ class GempubArchive: def render_file_entries(self): db = self.db for file in db.get_user_files(self.user) if self.user \ - else db.get_subspace_files(self.subspace): + else db.get_subspace_files(self.subspace) if self.subspace \ + else db.get_time_files(self.ts_range): post = db.get_post(id=file.segment_post) filesize = len(file.data) entry = GempubArchive.Entry(post, @@ -516,7 +570,10 @@ class GempubArchive: src_post_id = entry.post_id user_pattern = re.compile(r'^=>\s*/u/([\w%-]+)\s') - post_pattern = re.compile(r'^=>\s*/([us])/' + self.subspace.name + r'/(\d+)\s') + if self.subspace: + post_pattern = re.compile(r'^=>\s*/([us])/' + self.subspace.name + r'/(\d+)\s') + else: + post_pattern = re.compile(r'^=>\s*/([us])/[\w%-]+/(\d+)\s') file_pattern = re.compile(r'^=>\s*/([us])/[\w%-]+/(image|file)/(\d+)[^ ]*\s') root_pattern = re.compile(r'^=>\s*/([^ ]*)\s') rewritten = [] @@ -561,6 +618,14 @@ class GempubArchive: buffer = io.BytesIO() zip = zipfile.ZipFile(buffer, 'w', compression=zipfile.ZIP_DEFLATED, compresslevel=9) + def counter_text(count): + parts = [] + if count[0]: + parts.append(f'{count[0]} post{plural_s(count[0])}') + if count[1]: + parts.append(f'{count[1]} comment{plural_s(count[1])}') + return ' and '.join(parts) + with zip.open('metadata.txt', 'w') as f: for entry in self.metadata: f.write(f"{entry}: {self.metadata[entry]}\n".encode('utf-8')) @@ -568,7 +633,7 @@ class GempubArchive: with zip.open('title.gmi', 'w') as f: f.write(f""" -# {self.user.name if self.is_user else self.subspace.name} +# {self.user.name if self.is_user else self.subspace.name if self.subspace else self.metadata['title']} ## Gempub Archive @@ -583,7 +648,7 @@ Exported on {self.metadata['publishDate']}. index_page += '\n=> title.gmi Title page\n' profile_path = 'users/' + self.user.name + '.gmi' index_page += f'=> {profile_path} {self.user.avatar} {self.user.name}\n' - else: + elif self.subspace: index_page = f'# s/{self.subspace.name}\n\nTable of Contents:\n' index_page += '\n=> title.gmi Title page\n' profile_path = self.subspace.name + '.gmi' @@ -597,20 +662,73 @@ Exported on {self.metadata['publishDate']}. src += '\nThe subspace was created on ' + \ make_timestamp(self.subspace.ts_created, '%Y-%m-%d') + '.\n' f.write(src.encode('utf-8')) + else: + index_page = '# ' + self.metadata['title'] + '\n\nTable of Contents:\n\n' + + if self.local_entries: + index_page += f'\n=> posts/index.gmi Posts in {self.subspace.title()}\n' + local_index_page = f'# Posts in {self.subspace.title()}\n\n' + for entry in self.local_entries: + entry_path = 'posts/' + entry.path() + local_index_page += f'=> {entry.path()} {entry.ymd()} {entry.label}\n' + with zip.open(entry_path, 'w') as content: + content.write(self.rewrite_internal_urls(entry).encode('utf-8')) + with zip.open('posts/index.gmi', 'w') as content: + content.write(local_index_page.encode('utf-8')) + + if self.ts_range: + sub_links = [] + for sub_name in sorted(self.subspace_entries.keys(), key=str.lower): + first_entry = self.subspace_entries[sub_name][0] + sub = self.get_subspace(first_entry.subspace_id) + entry_path = f'{sub.title()[0]}_{sub.name}.gmi' + sub_links.append(f'=> {entry_path} {sub.title()}\n') + + title_icon = '' + if sub.owner: + title_icon = f'{self.get_user(first_entry.user_id).avatar} ' + sub_page = f'# {title_icon}{sub.title()}\n' + sub_page += f'{counter_text(self.subspace_count[sub.id])} in this subspace.\n' + + for entry in self.subspace_entries[sub_name]: + entry_user = self.get_user(entry.user_id) + author = f'{entry_user.avatar} {entry_user.name}' + meta = [] + top = None + if entry.issueid: + top = f'[#{entry.issueid}] {entry.title}' + meta.append(author) + if entry.tags: + top += f' · {entry.tags}' + elif not sub.owner: + meta.append(author) + meta.append(entry.dt.strftime('%Y-%m-%d %H:%M')) + if entry.num_cmts > 0: + meta.append(f'{entry.num_cmts} comment{plural_s(entry.num_cmts)}') + if entry.num_likes > 0: + meta.append(f'{entry.num_likes} like{plural_s(entry.num_likes)}') + if entry.tags and not entry.issueid: + meta.append(entry.tags) + link = f'=> posts/{entry.path()}' + if top: + sub_page += f'\n{link} {top}\n{entry.label}\n{" · ".join(meta)}\n' + else: + sub_page += f'\n{entry.label}\n{link} {" · ".join(meta)}\n' + # Write to the archive. + with zip.open('posts/' + entry.path(), 'w') as content: + content.write(self.rewrite_internal_urls(entry).encode('utf-8')) + with zip.open(entry_path, 'w') as content: + content.write(sub_page.encode('utf-8')) - # TODO: Rewrite user/page/file/image links to point to locations inside the archive. - - index_page += f'\n=> posts/index.gmi Posts in {self.subspace.title()}\n' - local_index_page = f'# Posts in {self.subspace.title()}\n\n' - for entry in self.local_entries: - entry_path = 'posts/' + entry.path() - local_index_page += f'=> {entry.path()} {entry.ymd()} {entry.label}\n' - with zip.open(entry_path, 'w') as content: - content.write(self.rewrite_internal_urls(entry).encode('utf-8')) - with zip.open('posts/index.gmi', 'w') as content: - content.write(local_index_page.encode('utf-8')) + prev_type = None + for link in sorted(sub_links, key=str.lower): + if prev_type and prev_type != link[3]: + index_page += '\n' + index_page += link + prev_type = link[3] # u or s + index_page += '\n' - if self.foreign_entries: + elif self.foreign_entries: index_page += f'=> other/index.gmi Posts in Other Subspaces\n' foreign_index_page = '# Posts in Other Subspaces\n' last_sub = None @@ -627,7 +745,6 @@ Exported on {self.metadata['publishDate']}. with zip.open('other/index.gmi', 'w') as content: content.write(foreign_index_page.encode('utf-8')) - # Comments. if self.comment_entries: index_page += f'=> comments/index.gmi Commented Posts\n' comment_index_page = '# Commented Posts\n' @@ -639,7 +756,6 @@ Exported on {self.metadata['publishDate']}. with zip.open('comments/index.gmi', 'w') as content: content.write(comment_index_page.encode('utf-8')) - # File attachments. if self.file_entries: index_page += '=> files/index.gmi File attachments\n' file_index_page = '# File Attachments\n' @@ -688,24 +804,33 @@ def export_gempub_archive(session): return 60, 'Login required' # Determine subspace to export. - m = re.search(r'/export/(s/)?([\w%-]+)\.gpub$', req.path) + m = re.search(r'/export/(s/|month/)?([\w%-]+)\.gpub$', req.path) if not m or not m[2]: return 59, 'Bad request' name = urlparse.unquote(m[2]) - subspace = db.get_subspace(name=name) + if m[1] == 'month/': + month_range = map(int, m[2].split('-')) + subspace = None + else: + month_range = None + subspace = db.get_subspace(name=name) is_user = m[1] is None # Check access rights. At the moment, exporting is only possible via user # settings and subspace admin pages, so the user must have moderation # rights in the exported subspace. - if is_user: + if month_range: + if not user: + # Have to be logged in. + return 61, 'Not authorized' + elif is_user: if subspace.owner != user.id: return 61, 'Not authorized' else: if user.id not in map(lambda u: u.id, db.get_mods(subspace)): return 61, 'Not authorized' - archive = GempubArchive(session, user if is_user else None, subspace) + archive = GempubArchive(session, user if is_user else None, subspace, month_range) archive.render_post_entries() archive.render_file_entries() data = archive.compress() diff --git a/utils.py b/utils.py index 32549ed..8b53673 100644 --- a/utils.py +++ b/utils.py @@ -6,6 +6,7 @@ import urllib.parse as urlparse UTC = datetime.timezone.utc GEMTEXT_MARKUP = re.compile(r'^(\s*=>\s*|\* |>\s*|##?#?)') +URI_PATTERN = re.compile(r'(gemini|finger|gopher|mailto|data|file|https?|fdroidrepos?:):(//)?[^`" ]+') INNER_LINK_PREFIX = '— ' @@ -79,6 +80,10 @@ def clean_text(text): return text.rstrip() +def strip_links(text): + return URI_PATTERN.sub(r'[\1 link]', text) + + def clean_title(title): # Strip `=>` and other Gemini syntax. cleaned = [] -- 2.34.1