diff options
main
Diffstat (limited to 'scripts')
| -rw-r--r-- | scripts/create_inventory.py | 178 | ||||
| -rw-r--r-- | scripts/organize_archive.py | 90 | ||||
| -rw-r--r-- | scripts/spider.py | 192 | ||||
| -rwxr-xr-x | scripts/test.sh | 111 |
4 files changed, 571 insertions, 0 deletions
#!/usr/bin/env python3
"""Build a content inventory and migration mapping for the archived Nixtamal site.

Scans docs/archive/organized for Markdown pages and static assets, then writes
docs/inventory.md (page/asset tables plus summary statistics) and
docs/migration-mapping.md (old page -> new single-page section mapping).
"""

import os
import re
from collections import defaultdict


def extract_title(content):
    """Return the first level-1 Markdown heading of *content*.

    The site-wide ' | Nixtamal' suffix is stripped; returns 'Untitled' when
    no '# ' heading exists.
    """
    for line in content.split('\n'):
        if line.startswith('# '):
            title = line[2:].strip()
            # Remove the site-wide title suffix if present.
            if ' | Nixtamal' in title:
                title = title.replace(' | Nixtamal', '')
            return title
    return 'Untitled'


def count_words(content):
    """Count word tokens in *content*, excluding the generated site footer."""
    # Everything from "Site made with Nix" onward is boilerplate footer.
    footer_start = content.find('Site made with Nix')
    if footer_start != -1:
        content = content[:footer_start]
    return len(re.findall(r'\w+', content))


# Top-level archive directory -> human-readable content type.
_SECTION_TYPES = {
    'home': 'Home Page',
    'install': 'Installation Guide',
    'manpage': 'Manpage',
    'changelog': 'Changelog',
    'community': 'Community',
    'faqs': 'FAQs',
    'funding': 'Funding',
    'roadmap': 'Roadmap',
    'real-world-showcase': 'Real-world Showcase',
}


def infer_content_type(path):
    """Classify a page by its location under docs/archive/organized/."""
    relative_path = path.replace('docs/archive/organized/', '')
    parts = relative_path.split('/')
    if len(parts) >= 2:
        section = parts[0]
        if section == 'cookbook':
            # The index lives directly under cookbook/; deeper files are recipes.
            return 'Cookbook Index' if len(parts) == 2 else 'Cookbook Recipe'
        if section in _SECTION_TYPES:
            return _SECTION_TYPES[section]
    return 'Other'


def _collect_pages(base_path):
    """Read every Markdown page under *base_path*; return (pages, total_words)."""
    pages = []
    total_words = 0
    for root, _dirs, files in os.walk(base_path):
        for name in files:
            if not name.endswith('.md'):
                continue
            md_file = os.path.join(root, name)
            try:
                with open(md_file, 'r', encoding='utf-8') as f:
                    content = f.read()
            except Exception as e:
                print(f"Error reading {md_file}: {e}")
                continue
            word_count = count_words(content)
            pages.append({
                'title': extract_title(content),
                'word_count': word_count,
                'type': infer_content_type(md_file),
                'path': md_file.replace('docs/archive/organized/', ''),
            })
            total_words += word_count
    return pages, total_words


def _collect_assets(base_path):
    """Gather files under base_path/assets; return (assets, total_size_bytes)."""
    assets = []
    total_size = 0
    assets_path = os.path.join(base_path, 'assets')
    if os.path.exists(assets_path):
        for root, _dirs, files in os.walk(assets_path):
            for name in files:
                path = os.path.join(root, name)
                try:
                    size = os.path.getsize(path)
                except OSError as e:
                    print(f"Error getting info for {path}: {e}")
                    continue
                ext = name.split('.')[-1].lower() if '.' in name else 'unknown'
                assets.append({
                    'path': path.replace('docs/archive/organized/', ''),
                    'type': ext,
                    'size': size,
                })
                total_size += size
    return assets, total_size


def _write_inventory(pages, total_words, assets, total_asset_size):
    """Write docs/inventory.md: page table, asset table, summary statistics."""
    with open('docs/inventory.md', 'w', encoding='utf-8') as f:
        f.write('# Content Inventory\n\n')

        f.write('## Pages\n\n')
        f.write('| Title | Word Count | Type | Path |\n')
        f.write('|-------|------------|------|------|\n')
        for page in sorted(pages, key=lambda x: x['path']):
            f.write(f"| {page['title']} | {page['word_count']} | {page['type']} | {page['path']} |\n")

        f.write('\n## Assets\n\n')
        f.write('| Path | Type | Size (bytes) |\n')
        f.write('|------|------|--------------|\n')
        for asset in sorted(assets, key=lambda x: x['path']):
            f.write(f"| {asset['path']} | {asset['type']} | {asset['size']} |\n")

        f.write('\n## Summary Statistics\n\n')
        f.write(f'- Total Pages: {len(pages)}\n')
        f.write(f'- Total Words: {total_words}\n')
        f.write(f'- Total Assets: {len(assets)}\n')
        f.write(f'- Total Asset Size: {total_asset_size} bytes\n')


def _write_mapping(pages):
    """Write docs/migration-mapping.md: old page -> new section, with priorities."""
    with open('docs/migration-mapping.md', 'w', encoding='utf-8') as f:
        f.write('# Migration Mapping\n\n')
        f.write('This document outlines how the archived Nixtamal website content will be mapped to the new single-page website structure.\n\n')

        f.write('## Section Mappings\n\n')
        f.write('Each old page is mapped to a section in the new single-page layout.\n\n')
        f.write('| Old Page | New Section | Priority |\n')
        f.write('|----------|-------------|----------|\n')

        # Content type -> (new section, migration priority).
        section_mappings = {
            'Home Page': ('Introduction/Hero', 'must-have'),
            'Installation Guide': ('Installation', 'must-have'),
            'Cookbook Index': ('Cookbook', 'should-have'),
            'Cookbook Recipe': ('Cookbook (subsection)', 'should-have'),
            'Manpage': ('Documentation/Manual', 'must-have'),
            'Changelog': ('Changelog', 'nice-to-have'),
            'Community': ('Community', 'should-have'),
            'FAQs': ('FAQs', 'should-have'),
            'Funding': ('Funding/Support', 'nice-to-have'),
            'Roadmap': ('Roadmap', 'nice-to-have'),
            'Real-world Showcase': ('Showcase', 'nice-to-have'),
            'Other': ('Miscellaneous', 'nice-to-have'),
        }

        for page in sorted(pages, key=lambda x: x['path']):
            section, priority = section_mappings.get(page['type'], ('Other', 'nice-to-have'))
            f.write(f"| {page['title']} | {section} | {priority} |\n")

        f.write('\n## Content Grouping Suggestions\n\n')
        f.write('- **Introduction/Hero**: Combine home page content with key features and showcase.\n')
        f.write('- **Installation**: Direct installation guide.\n')
        f.write('- **Cookbook**: Group all cookbook recipes under expandable sections or tabs.\n')
        f.write('- **Documentation/Manual**: Include manpages with proper formatting.\n')
        f.write('- **Community**: Community links and information.\n')
        f.write('- **FAQs**: Frequently asked questions.\n')
        f.write('- **Changelog, Roadmap, Funding, Showcase**: Place in footer or separate sections with navigation.\n')

        f.write('\n## Priority Rankings\n\n')
        f.write('- **Must-have**: Introduction/Hero, Installation, Documentation/Manual\n')
        f.write('- **Should-have**: Cookbook, Community, FAQs\n')
        f.write('- **Nice-to-have**: Changelog, Roadmap, Funding, Real-world Showcase\n')

        f.write('\n## Recommendations for New Website Structure\n\n')
        f.write('The new single-page website should have a sticky navigation header with sections: Home, Install, Cookbook, Docs, Community.\n')
        f.write('Use smooth scrolling or anchors for navigation within the page.\n')
        f.write('For cookbook, use accordion or tabbed interface for recipes to keep it organized.\n')
        f.write('Assets like CSS and logo should be integrated into the single-page design.\n')


def main():
    """Generate the inventory and migration-mapping documents."""
    base_path = 'docs/archive/organized'
    pages, total_words = _collect_pages(base_path)
    assets, total_asset_size = _collect_assets(base_path)
    _write_inventory(pages, total_words, assets, total_asset_size)
    _write_mapping(pages)


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
"""Organize the crawled Nixtamal archive into a browsable Markdown tree.

Reads docs/archive/structure.json (produced by scripts/spider.py), copies the
downloaded assets, writes one Markdown file per crawled page under
docs/archive/organized/, and generates an index.md grouped by section.
"""

import json
import os
import shutil

# Root of the crawled site; only URLs under this prefix are organized.
BASE_URL = 'https://nixtamal.toast.al'


def main():
    """Convert structure.json into the organized Markdown archive."""
    with open('docs/archive/structure.json', 'r') as f:
        data = json.load(f)

    organized_dir = 'docs/archive/organized'
    os.makedirs(organized_dir, exist_ok=True)

    # Mirror the downloaded assets alongside the Markdown tree.
    assets_src = 'docs/archive/assets'
    assets_dst = os.path.join(organized_dir, 'assets')
    if os.path.exists(assets_src):
        shutil.copytree(assets_src, assets_dst, dirs_exist_ok=True)

    pages = []

    for url, page_data in data.items():
        # Skip in-page anchors and anything outside the site.
        if '#' in url or not url.startswith(BASE_URL):
            continue

        # Map the URL to a relative directory path; the site root becomes
        # 'home'. strip('/') handles both the bare root and trailing slashes.
        path = url[len(BASE_URL):].strip('/')
        if not path:
            path = 'home'

        full_path = os.path.join(organized_dir, path)
        os.makedirs(full_path, exist_ok=True)

        title = page_data['title'].strip()

        # Collapse whitespace: keep each non-empty line as its own paragraph.
        text = '\n\n'.join(
            stripped for line in page_data['text'].split('\n')
            if (stripped := line.strip())
        )

        # The page file is named after the last path component.
        filename = path.rsplit('/', 1)[-1] + '.md'

        md_path = os.path.join(full_path, filename)
        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(f'# {title}\n\n{text}\n')

        # (path, title, path relative to the organized dir) for the index.
        pages.append((path, title, md_path.replace(organized_dir + '/', '')))

    # Group pages by their top-level section for the index.
    sections = {}
    for path, title, rel_path in pages:
        top = path.split('/', 1)[0]
        sections.setdefault(top, []).append((path, title, rel_path))

    index_path = os.path.join(organized_dir, 'index.md')
    with open(index_path, 'w', encoding='utf-8') as f:
        f.write('# Nixtamal Documentation Archive\n\n')
        f.write('This is an organized archive of the Nixtamal documentation.\n\n')
        f.write('## Contents\n\n')
        for top in sorted(sections):
            f.write(f'### {top.capitalize()}\n\n')
            for path, title, rel_path in sorted(sections[top]):
                f.write(f'- [{title}]({rel_path})\n')
            f.write('\n')


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
"""Crawl the Nixtamal website and archive its pages, assets and link graph.

Saves raw HTML under docs/archive/, downloaded assets under
docs/archive/assets/, and a structure.json describing every page (title,
meta tags, extracted text, outgoing links, assets).
"""

import os
import sys
import urllib.parse
import urllib.robotparser
import html.parser
import json
import requests
import mimetypes
from collections import deque


class LinkExtractor(html.parser.HTMLParser):
    """Collect links, asset references, text and metadata from one HTML page."""

    def __init__(self):
        super().__init__()
        self.links = []        # href values of <a> tags
        self.images = []       # src values of <img> tags
        self.scripts = []      # src values of <script> tags
        self.styles = []       # href values of stylesheet/icon <link> tags
        self.text_parts = []   # non-empty text runs outside <title>
        self.title = ''
        self.meta = {}         # meta name/property -> content
        self.in_title = False

    def handle_starttag(self, tag, attrs):
        """Record outgoing links, asset references and metadata as tags open."""
        attr_map = dict(attrs)
        if tag == 'a':
            href = attr_map.get('href')
            if href:
                self.links.append(href)
        elif tag == 'img':
            src = attr_map.get('src')
            if src:
                self.images.append(src)
        elif tag == 'script':
            src = attr_map.get('src')
            if src:
                self.scripts.append(src)
        elif tag == 'link':
            # Only stylesheets and icons are archived; other rels are ignored.
            rel = attr_map.get('rel')
            href = attr_map.get('href')
            if rel in ('stylesheet', 'icon') and href:
                self.styles.append(href)
        elif tag == 'title':
            self.in_title = True
        elif tag == 'meta':
            name = attr_map.get('name') or attr_map.get('property')
            content = attr_map.get('content')
            if name and content:
                self.meta[name] = content

    def handle_endtag(self, tag):
        if tag == 'title':
            self.in_title = False

    def handle_data(self, data):
        if self.in_title:
            self.title += data
        else:
            text = data.strip()
            # Skip whitespace-only runs so joined text has no empty tokens.
            if text:
                self.text_parts.append(text)


def download_asset(url, base_path, timeout=10):
    """Download *url* into *base_path*.

    The filename comes from the URL path, with an extension guessed from the
    Content-Type when missing. Returns the local file path, or None on any
    failure (non-200 status or network/filesystem error).
    """
    try:
        resp = requests.get(url, timeout=timeout)
        if resp.status_code == 200:
            content_type = resp.headers.get('content-type', '')
            ext = mimetypes.guess_extension(content_type) or '.bin'
            filename = os.path.basename(urllib.parse.urlparse(url).path)
            if not filename:
                filename = 'asset' + ext
            elif not os.path.splitext(filename)[1]:
                filename += ext
            filepath = os.path.join(base_path, filename)
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            with open(filepath, 'wb') as f:
                f.write(resp.content)
            return filepath
    except Exception as e:
        print(f"Error downloading {url}: {e}")
    return None


def main():
    """Breadth-first crawl of the site, honouring robots.txt."""
    base_url = 'https://nixtamal.toast.al'

    # Honour robots.txt. Note: catching Exception (not a bare except) is
    # essential here — a bare except would also swallow the SystemExit raised
    # by sys.exit(1) and continue crawling despite a robots.txt disallow.
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(base_url + '/robots.txt')
    try:
        rp.read()
        if not rp.can_fetch('*', base_url + '/'):
            print("Crawling not allowed by robots.txt")
            sys.exit(1)
    except Exception:
        print("Could not read robots.txt, proceeding assuming allowed")

    os.makedirs('docs/archive', exist_ok=True)
    os.makedirs('docs/archive/assets', exist_ok=True)

    visited = set()
    queue = deque([base_url])
    enqueued = {base_url}  # every URL ever queued; avoids O(n) deque scans
    pages_data = {}

    while queue:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        print(f"Crawling: {url}")

        try:
            resp = requests.get(url, timeout=10)
            if resp.status_code != 200:
                print(f"Skipping {url} with status {resp.status_code}")
                continue

            content = resp.text
            parser = LinkExtractor()
            parser.feed(content)

            # Resolve on-site links and enqueue any not seen before.
            abs_links = []
            for link in parser.links:
                abs_link = urllib.parse.urljoin(url, link)
                if abs_link.startswith(base_url):
                    abs_links.append(abs_link)
                    if abs_link not in enqueued:
                        enqueued.add(abs_link)
                        queue.append(abs_link)

            # Download on-site assets (order preserved: images, scripts, styles).
            assets = []
            for kind, refs in (('image', parser.images),
                               ('script', parser.scripts),
                               ('style', parser.styles)):
                for ref in refs:
                    asset_url = urllib.parse.urljoin(url, ref)
                    if not asset_url.startswith(base_url):
                        continue
                    local = download_asset(asset_url, 'docs/archive/assets')
                    if local:
                        assets.append({'type': kind, 'url': asset_url,
                                       'local_path': local})

            # Save the raw HTML under a flattened filename.
            path = urllib.parse.urlparse(url).path
            if not path or path == '/':
                filename = 'index.html'
            else:
                filename = path.strip('/').replace('/', '_') + '.html'
            filepath = os.path.join('docs/archive', filename)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)

            pages_data[url] = {
                'title': parser.title,
                'meta': parser.meta,
                'text': ' '.join(parser.text_parts),
                'links': abs_links,
                'assets': assets,
                'local_file': filepath,
            }

        except Exception as e:
            print(f"Error crawling {url}: {e}")

    with open('docs/archive/structure.json', 'w', encoding='utf-8') as f:
        json.dump(pages_data, f, indent=2, ensure_ascii=False)

    print("Crawling complete. Data saved to docs/archive/")


if __name__ == '__main__':
    main()
#!/usr/bin/env bash

# Integration test for the Nixtamal single-page website.
# Verifies that index.html exists, that every locally referenced file is
# present, and that the basic HTML structure looks sane.
#
# Exit status: 1 if any ERROR was reported, 0 otherwise.

echo "=== Nixtamal Website Integration Test ==="

# Count every ERROR so the final exit status reflects all failures,
# not just missing file references (the original only checked the latter).
ERRORS=0

# Repository root: one level above this script.
BASE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
INDEX_FILE="$BASE_DIR/index.html"

if [ ! -f "$INDEX_FILE" ]; then
    echo "ERROR: index.html not found at $INDEX_FILE"
    exit 1
fi

echo "Checking file references in index.html..."

# Extract src= values from script tags and non-anchor href= values.
SCRIPT_FILES=$(grep -oP 'src="\K[^"]+' "$INDEX_FILE")
LINK_FILES=$(grep -oP 'href="\K[^"#][^"]*' "$INDEX_FILE")

# Combine and deduplicate.
ALL_FILES=$(printf '%s\n%s\n' "$SCRIPT_FILES" "$LINK_FILES" | sort -u)

# Read line-by-line (not word-split) so paths with spaces survive.
MISSING_FILES=()
while IFS= read -r file; do
    # Skip blank lines and external URLs.
    [ -z "$file" ] && continue
    [[ $file == http* ]] && continue
    if [ ! -f "$BASE_DIR/$file" ]; then
        MISSING_FILES+=("$file")
    fi
done <<< "$ALL_FILES"

if [ ${#MISSING_FILES[@]} -gt 0 ]; then
    echo "ERROR: Missing referenced files:"
    ERRORS=$((ERRORS + 1))
    for file in "${MISSING_FILES[@]}"; do
        echo "  - $file"
    done
else
    echo "✓ All referenced files exist"
fi

echo "Validating basic HTML structure..."

# require <pattern> <message>: count an ERROR when the pattern is absent.
require() {
    if ! grep -q "$1" "$INDEX_FILE"; then
        echo "ERROR: $2"
        ERRORS=$((ERRORS + 1))
    fi
}

if ! grep -q "<!DOCTYPE html>" "$INDEX_FILE"; then
    echo "WARNING: Missing DOCTYPE declaration"
fi

require "<html" "Missing <html> tag"
require "</html>" "Missing </html> closing tag"
require "<head>" "Missing <head> tag"
require "<body>" "Missing <body> tag"

# Rough tag-balance check; void elements make this only approximate.
OPEN_TAGS=$(grep -o '<[^/][^>]*>' "$INDEX_FILE" | wc -l)
CLOSE_TAGS=$(grep -o '</[^>]*>' "$INDEX_FILE" | wc -l)

if [ "$OPEN_TAGS" -ne "$CLOSE_TAGS" ]; then
    echo "WARNING: Potential unbalanced tags (open: $OPEN_TAGS, close: $CLOSE_TAGS)"
else
    echo "✓ Basic tag balance looks good"
fi

# Every top-level navigation section must exist.
REQUIRED_SECTIONS=("home" "install" "cookbook" "docs" "community")
SECTION_ERRORS=0
for section in "${REQUIRED_SECTIONS[@]}"; do
    if ! grep -q "id=\"$section\"" "$INDEX_FILE"; then
        echo "ERROR: Missing section with id=\"$section\""
        ERRORS=$((ERRORS + 1))
        SECTION_ERRORS=$((SECTION_ERRORS + 1))
    fi
done

# Only claim success when every section was found
# (the original printed this unconditionally).
if [ "$SECTION_ERRORS" -eq 0 ]; then
    echo "✓ Section IDs present"
fi

if ! grep -q "webgl-bg.js" "$INDEX_FILE"; then
    echo "WARNING: webgl-bg.js not referenced - WebGL background may not load"
fi

if ! grep -q "parallax.js" "$INDEX_FILE"; then
    echo "WARNING: parallax.js not referenced - Parallax effects may not work"
fi

echo "=== Test Complete ==="

if [ "$ERRORS" -gt 0 ]; then
    echo "FAIL: Issues found"
    exit 1
fi

echo "PASS: All checks passed"
