You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
319 lines
11 KiB
319 lines
11 KiB
"""
Scanner module for the manual pipeline.

Scans manual_posts/ directories for screenshots (.png, .jpg, .jpeg),
audio files (.mp3), and optional text files (.txt). Builds a unified
post_object for processing.

Folder convention:
    manual_posts/
    └── my_post_001/
        ├── meta.json       (optional - metadata)
        ├── 0_title.png     (required - screenshot of post title)
        ├── 0_title.mp3     (preferred - pre-recorded audio)
        ├── 0_title.txt     (fallback - text for TTS if no .mp3)
        ├── 1_comment.png   (optional - comment screenshots)
        ├── 1_comment.mp3   (preferred - pre-recorded audio)
        ├── 1_comment.txt   (fallback - text for TTS if no .mp3)
        └── ...

Priority: .mp3 > .txt (if both exist, .mp3 is used and TTS is skipped).
"""
|
|
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
from utils.console import print_step, print_substep
|
|
|
|
|
|
class PostScanner:
    """Scans manual_posts/ directory, validates structure, builds post_object.

    Each post lives in its own sub-folder of ``input_dir``. Only regular files
    named ``<index>_<title|comment>.<ext>`` are considered; everything else in
    the folder (apart from ``meta.json``) is ignored.
    """

    # Regex pattern: <number>_<type>.<ext> where ext is png/jpg/jpeg/mp3/txt
    FILE_PATTERN = re.compile(r"^(\d+)_(title|comment)\.(png|jpg|jpeg|mp3|txt)$", re.IGNORECASE)

    def __init__(self, input_dir: str = "manual_posts"):
        """Remember the base directory that contains one folder per post."""
        self.input_dir = Path(input_dir)

    def scan_all(self) -> List[dict]:
        """Scan all post folders in the input directory.

        Hidden folders (name starting with ".") and folders that fail
        validation are skipped.

        Returns:
            List of post_object dicts, sorted by folder name
        """
        # is_dir() instead of exists(): a plain file at this path cannot be iterated.
        if not self.input_dir.is_dir():
            print_substep(f"Input directory '{self.input_dir}' does not exist.", style="red")
            return []

        posts = []
        for post_dir in sorted(self.input_dir.iterdir()):
            if post_dir.is_dir() and not post_dir.name.startswith("."):
                post_obj = self.scan_one(post_dir.name)
                if post_obj is not None:
                    posts.append(post_obj)

        return posts

    def scan_one(self, post_id: str) -> Optional[dict]:
        """Scan a single post folder and build post_object.

        Args:
            post_id: Name of the folder inside the input directory

        Returns:
            post_object dict or None if the folder is missing or invalid
        """
        post_dir = self.input_dir / post_id

        # A regular file named like the post would pass exists() and then
        # crash in iterdir(); require an actual directory.
        if not post_dir.is_dir():
            print_substep(f"Post directory '{post_dir}' does not exist.", style="red")
            return None

        is_valid, errors = self.validate(post_dir)
        if not is_valid:
            print_substep(f"Validation failed for '{post_id}':", style="red")
            for err in errors:
                print_substep(f" ✗ {err}", style="red")
            return None

        return self._build_post_object(post_dir)

    def validate(self, post_dir: Path) -> Tuple[bool, List[str]]:
        """Validate a post folder structure.

        Checks:
            - At least 1 image file exists
            - An image with index 0 (the title image) exists
            - Each image has a corresponding .mp3 or .txt file
            - .txt files used as the TTS fallback are readable and non-empty

        Returns:
            (is_valid, list_of_errors)
        """
        errors: List[str] = []

        # Gather all matching files
        images, audios, texts = self._categorize_files(post_dir)

        # Check: at least 1 image (nothing else can be checked without one)
        if not images:
            errors.append("No image files found. Need at least 0_title.png")
            return False, errors

        # Check: title image exists (index 0)
        if 0 not in images:
            errors.append("Missing title image: 0_title.png (must start with '0_')")

        # Check: each image has a corresponding .mp3 or .txt file
        for idx in sorted(images):
            if idx not in audios and idx not in texts:
                # Suggest the name that matches the image (e.g. "1_comment"),
                # not a hard-coded "<idx>_title".
                stem = images[idx].stem
                errors.append(
                    f"Missing audio/text for image #{idx}: "
                    f"provide '{stem}.mp3' (or .txt as fallback)"
                )

        # Check: text files (used as TTS fallback) are readable and not empty
        for idx, txt_path in sorted(texts.items()):
            if idx in audios:  # .mp3 wins, so the .txt content is irrelevant
                continue
            content = self._read_text(txt_path)
            if content is None:
                errors.append(f"Text file could not be read as UTF-8: {txt_path.name}")
            elif not content:
                errors.append(f"Text file is empty (and no .mp3 provided): {txt_path.name}")

        return len(errors) == 0, errors

    def list_status(self) -> List[dict]:
        """List all posts with their status.

        Returns:
            List of dicts with keys: post_id, num_images, num_audios,
            num_texts, status ("empty" | "incomplete" | "ready"), errors
        """
        if not self.input_dir.is_dir():
            return []

        results = []
        for post_dir in sorted(self.input_dir.iterdir()):
            if not post_dir.is_dir() or post_dir.name.startswith("."):
                continue

            images, audios, texts = self._categorize_files(post_dir)
            is_valid, errors = self.validate(post_dir)

            # Determine status
            if not images:
                status = "empty"
            elif not is_valid:
                status = "incomplete"
            else:
                status = "ready"

            results.append(
                {
                    "post_id": post_dir.name,
                    "num_images": len(images),
                    "num_audios": len(audios),
                    "num_texts": len(texts),
                    "status": status,
                    "errors": errors,
                }
            )

        return results

    def _categorize_files(self, post_dir: Path) -> Tuple[Dict[int, Path], Dict[int, Path], Dict[int, Path]]:
        """Categorize files in a post directory into images, audios, and texts.

        Only regular files whose name matches FILE_PATTERN are considered.
        Entries are visited in sorted order, so when several files share an
        index and category (e.g. 0_title.jpg and 0_title.png) the one that
        sorts last is kept, independent of the OS listing order.

        Returns:
            (images_dict, audios_dict, texts_dict) where key is the index number
        """
        images: Dict[int, Path] = {}  # {0: Path("0_title.png"), ...}
        audios: Dict[int, Path] = {}  # {0: Path("0_title.mp3"), ...}
        texts: Dict[int, Path] = {}  # {0: Path("0_title.txt"), ...}

        for f in sorted(post_dir.iterdir()):
            match = self.FILE_PATTERN.match(f.name)
            # Skip non-matching names and sub-directories that merely look like media files.
            if match is None or not f.is_file():
                continue

            idx = int(match.group(1))
            ext = match.group(3).lower()
            if ext in ("png", "jpg", "jpeg"):
                images[idx] = f
            elif ext == "mp3":
                audios[idx] = f
            elif ext == "txt":
                texts[idx] = f

        return images, audios, texts

    @staticmethod
    def _read_text(path: Path) -> Optional[str]:
        """Return the stripped UTF-8 content of *path*, or None if it cannot be read."""
        try:
            return path.read_text(encoding="utf-8").strip()
        except (OSError, UnicodeDecodeError):
            return None

    def _build_post_object(self, post_dir: Path) -> dict:
        """Build the unified post_object from a validated post directory.

        Returns:
            dict with structure:
            {
                "post_id": str,
                "platform": str,
                "title": str,
                "author": str,
                "url": str,
                "post_dir": str,
                "screenshots": [
                    {
                        "index": int,
                        "type": "title" | "comment",
                        "image_path": str,
                        "text": str,
                        "audio_path": str | None,
                        "audio_duration": None,
                    },
                    ...
                ],
                "total_duration": 0,
                "output_path": None,
            }
        """
        post_id = post_dir.name

        # Read optional meta.json
        meta = self._read_meta(post_dir)

        # Categorize files
        images, audios, texts = self._categorize_files(post_dir)

        # Build screenshots list (sorted by index)
        screenshots = []
        for idx in sorted(images):
            img_path = images[idx]
            # Determine type from filename
            match = self.FILE_PATTERN.match(img_path.name)
            entry_type = match.group(2).lower() if match else "comment"

            # Audio: prefer .mp3, fallback to .txt for TTS
            audio_path = str(audios[idx]) if idx in audios else None
            text_content = ""
            if idx in texts:
                text = self._read_text(texts[idx])
                if text is None:
                    # Unreadable text: warn and continue with empty text
                    # instead of aborting the whole post.
                    print_substep(f"Warning: Could not read {texts[idx].name}", style="yellow")
                else:
                    text_content = text

            screenshots.append(
                {
                    "index": idx,
                    "type": entry_type,
                    "image_path": str(img_path),
                    "text": text_content,
                    "audio_path": audio_path,  # Pre-filled if .mp3 exists
                    "audio_duration": None,
                }
            )

        # Use title text (first 100 chars), meta title, or folder name
        if screenshots and screenshots[0]["text"]:
            title = screenshots[0]["text"][:100]
        elif meta.get("title"):
            title = meta["title"]
        else:
            title = post_id

        return {
            "post_id": post_id,
            "platform": meta.get("platform", "other"),
            "title": title,
            "author": meta.get("author", ""),
            "url": meta.get("url", ""),
            "post_dir": str(post_dir),
            "screenshots": screenshots,
            "total_duration": 0,
            "output_path": None,
        }

    def _read_meta(self, post_dir: Path) -> dict:
        """Read meta.json if it exists, return empty dict otherwise.

        An empty dict is also returned (with a warning) when the file cannot
        be read or parsed, or when its top-level JSON value is not an object.
        """
        meta_path = post_dir / "meta.json"
        if not meta_path.exists():
            return {}

        try:
            with open(meta_path, "r", encoding="utf-8") as f:
                meta = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
            print_substep(f"Warning: Could not read meta.json: {e}", style="yellow")
            return {}

        # Callers use meta.get(...); a JSON list/string/number would crash them.
        if not isinstance(meta, dict):
            print_substep("Warning: meta.json must contain a JSON object; ignoring it.", style="yellow")
            return {}
        return meta
|
|
|
|
|
|
def create_post_folder(input_dir: str, post_id: str, platform: str = "reddit") -> Path:
    """Create a post folder (if needed), seed it with meta.json and print usage hints.

    An existing meta.json is never overwritten.

    Args:
        input_dir: Base directory for manual posts
        post_id: Name for the new post folder
        platform: Source platform (reddit, threads, x, other)

    Returns:
        Path to the created folder
    """
    target = Path(input_dir) / post_id
    target.mkdir(parents=True, exist_ok=True)

    # Write the meta.json template only when the folder does not have one yet.
    meta_file = target / "meta.json"
    if not meta_file.exists():
        template = {
            "platform": platform,
            "post_id": post_id,
            "title": "",
            "author": "",
            "url": "",
            "created_at": "",
            "tags": [],
            "notes": "",
        }
        with open(meta_file, "w", encoding="utf-8") as handle:
            json.dump(template, handle, indent=4, ensure_ascii=False)

    print_step(f"Created post folder: {target}")
    print_substep("Next steps:", style="bold cyan")
    hints = (
        " 1. Add screenshots: 0_title.png, 1_comment.png, ...",
        " 2. Add audio files: 0_title.mp3, 1_comment.mp3, ...",
        " (Or use .txt files instead — TTS will generate audio)",
        " 3. (Optional) Edit meta.json with post details",
        f" 4. Run: python manual_main.py render {post_id}",
    )
    for hint in hints:
        print_substep(hint)

    return target
|