diff options
Diffstat (limited to 'scripts/spider.py')
| -rw-r--r-- | scripts/spider.py | 192 |
1 file changed, 192 insertions(+), 0 deletions(-)
#!/usr/bin/env python3
"""Mirror a small website into docs/archive/.

Breadth-first crawl starting at BASE_URL (honouring robots.txt when it
can be read), saving each page's raw HTML under docs/archive/,
downloading same-site images, scripts and stylesheets into
docs/archive/assets/, and writing a structure.json index with each
page's title, meta tags, text, outgoing links and downloaded assets.
"""

import hashlib
import html.parser
import json
import mimetypes
import os
import sys
import urllib.parse
import urllib.robotparser
from collections import deque

import requests

BASE_URL = 'https://nixtamal.toast.al'
ARCHIVE_DIR = 'docs/archive'
ASSETS_DIR = os.path.join(ARCHIVE_DIR, 'assets')
TIMEOUT = 10  # seconds, per HTTP request


class LinkExtractor(html.parser.HTMLParser):
    """Collect links, asset URLs, title, meta tags and body text from HTML."""

    def __init__(self):
        super().__init__()
        self.links = []        # href values of <a> tags
        self.images = []       # src values of <img> tags
        self.scripts = []      # src values of <script> tags
        self.styles = []       # href values of <link rel="stylesheet"/"icon">
        self.text_parts = []   # stripped text nodes outside <title>
        self.title = ''
        self.meta = {}         # <meta> name/property -> content
        self.in_title = False

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == 'a':
            if attrs.get('href'):
                self.links.append(attrs['href'])
        elif tag == 'img':
            if attrs.get('src'):
                self.images.append(attrs['src'])
        elif tag == 'script':
            if attrs.get('src'):
                self.scripts.append(attrs['src'])
        elif tag == 'link':
            # rel is a space-separated token list (e.g. "shortcut icon"),
            # so match tokens rather than the exact attribute string.
            rel_tokens = set((attrs.get('rel') or '').split())
            href = attrs.get('href')
            if href and rel_tokens & {'stylesheet', 'icon'}:
                self.styles.append(href)
        elif tag == 'title':
            self.in_title = True
        elif tag == 'meta':
            name = attrs.get('name') or attrs.get('property')
            content = attrs.get('content')
            if name and content:
                self.meta[name] = content

    def handle_endtag(self, tag):
        if tag == 'title':
            self.in_title = False

    def handle_data(self, data):
        if self.in_title:
            self.title += data
        else:
            self.text_parts.append(data.strip())


def download_asset(url, base_path, timeout=TIMEOUT):
    """Download one asset into base_path; return the local path, or None.

    Filenames are prefixed with a short hash of the URL so that assets
    from different directories that share a basename (e.g. several
    logo.png files) do not overwrite each other.
    """
    try:
        resp = requests.get(url, timeout=timeout)
        if resp.status_code != 200:
            return None
        # Strip "; charset=..." parameters: guess_extension() only
        # understands a bare media type and would otherwise return None.
        media_type = resp.headers.get('content-type', '').split(';')[0].strip()
        ext = mimetypes.guess_extension(media_type) or '.bin'
        basename = os.path.basename(urllib.parse.urlparse(url).path)
        if not basename:
            basename = 'asset' + ext
        elif not os.path.splitext(basename)[1]:
            basename += ext
        digest = hashlib.sha1(url.encode('utf-8')).hexdigest()[:8]
        filepath = os.path.join(base_path, f'{digest}_{basename}')
        os.makedirs(base_path, exist_ok=True)
        with open(filepath, 'wb') as f:
            f.write(resp.content)
        return filepath
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {url}: {e}")
        return None


def _load_robots(base_url):
    """Fetch robots.txt for base_url; return the parser, or None if unreadable."""
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(base_url + '/robots.txt')
    try:
        rp.read()
    except Exception as e:  # network/parse failure: proceed, assume allowed
        print(f"Could not read robots.txt ({e}), proceeding assuming allowed")
        return None
    return rp


def _local_filename(url):
    """Map a page URL to an archive filename ('/' -> index.html)."""
    path = urllib.parse.urlparse(url).path
    if not path or path == '/':
        return 'index.html'
    name = path.strip('/').replace('/', '_')
    # Avoid doubling the extension for paths that already end in .html.
    if not name.endswith('.html'):
        name += '.html'
    return name


def main():
    rp = _load_robots(BASE_URL)
    if rp is not None and not rp.can_fetch('*', BASE_URL + '/'):
        print("Crawling not allowed by robots.txt")
        sys.exit(1)

    os.makedirs(ASSETS_DIR, exist_ok=True)  # also creates ARCHIVE_DIR

    visited = set()
    enqueued = {BASE_URL}   # mirror of queue contents for O(1) membership tests
    queue = deque([BASE_URL])
    asset_cache = {}        # asset URL -> local path (or None); download once
    pages_data = {}

    while queue:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        # Re-check robots.txt per page, not just for the site root.
        if rp is not None and not rp.can_fetch('*', url):
            print(f"Skipping {url}: disallowed by robots.txt")
            continue
        print(f"Crawling: {url}")

        try:
            resp = requests.get(url, timeout=TIMEOUT)
            if resp.status_code != 200:
                print(f"Skipping {url} with status {resp.status_code}")
                continue

            content = resp.text
            parser = LinkExtractor()
            parser.feed(content)

            # Resolve and filter same-site links; drop #fragments so that
            # "page" and "page#section" are not crawled twice.
            abs_links = []
            for link in parser.links:
                abs_link = urllib.parse.urldefrag(
                    urllib.parse.urljoin(url, link))[0]
                if not abs_link.startswith(BASE_URL):
                    continue
                abs_links.append(abs_link)
                if abs_link not in visited and abs_link not in enqueued:
                    enqueued.add(abs_link)
                    queue.append(abs_link)

            # Download same-site assets, each URL at most once per crawl.
            assets = []
            for kind, refs in (('image', parser.images),
                               ('script', parser.scripts),
                               ('style', parser.styles)):
                for ref in refs:
                    asset_url = urllib.parse.urljoin(url, ref)
                    if not asset_url.startswith(BASE_URL):
                        continue
                    if asset_url not in asset_cache:
                        asset_cache[asset_url] = download_asset(
                            asset_url, ASSETS_DIR)
                    local = asset_cache[asset_url]
                    if local:
                        assets.append({'type': kind, 'url': asset_url,
                                       'local_path': local})

            # Save the raw page HTML.
            filepath = os.path.join(ARCHIVE_DIR, _local_filename(url))
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)

            pages_data[url] = {
                'title': parser.title,
                'meta': parser.meta,
                'text': ' '.join(parser.text_parts),
                'links': abs_links,
                'assets': assets,
                'local_file': filepath,
            }
        except Exception as e:
            # Keep crawling the remaining pages even if one fails.
            print(f"Error crawling {url}: {e}")

    # Save the crawl index.
    with open(os.path.join(ARCHIVE_DIR, 'structure.json'), 'w',
              encoding='utf-8') as f:
        json.dump(pages_data, f, indent=2, ensure_ascii=False)

    print("Crawling complete. Data saved to docs/archive/")


if __name__ == '__main__':
    main()
