#!/usr/bin/env python3
"""Archive a small website: crawl pages, save assets, record site structure.

Crawls the site rooted at ``base_url`` (honoring robots.txt), writes each
HTML page and its same-origin images/scripts/stylesheets under
``docs/archive/``, and dumps a machine-readable site map to
``docs/archive/structure.json``.
"""

import html.parser
import json
import mimetypes
import os
import sys
import urllib.error
import urllib.parse
import urllib.robotparser
from collections import deque


class LinkExtractor(html.parser.HTMLParser):
    """Collect links, asset references, title, meta tags and visible text."""

    def __init__(self):
        super().__init__()
        self.links = []        # href values of <a> tags
        self.images = []       # src values of <img> tags
        self.scripts = []      # src values of <script> tags
        self.styles = []       # href of <link rel="stylesheet"> / rel="icon"
        self.text_parts = []   # non-empty text chunks outside <title>
        self.title = ''
        self.meta = {}         # <meta> name/property -> content
        self.in_title = False

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.links.extend(v for k, v in attrs if k == 'href')
        elif tag == 'img':
            self.images.extend(v for k, v in attrs if k == 'src')
        elif tag == 'script':
            self.scripts.extend(v for k, v in attrs if k == 'src')
        elif tag == 'link':
            d = dict(attrs)  # duplicate attributes: last one wins
            if d.get('rel') in ('stylesheet', 'icon') and d.get('href'):
                self.styles.append(d['href'])
        elif tag == 'title':
            self.in_title = True
        elif tag == 'meta':
            d = dict(attrs)
            name = d.get('name') or d.get('property')
            content = d.get('content')
            if name and content:
                self.meta[name] = content

    def handle_endtag(self, tag):
        if tag == 'title':
            self.in_title = False

    def handle_data(self, data):
        if self.in_title:
            self.title += data
        else:
            # Skip whitespace-only runs; the original appended '' entries,
            # which padded the eventual ' '.join() with extra spaces.
            text = data.strip()
            if text:
                self.text_parts.append(text)


def download_asset(url, base_path, timeout=10):
    """Fetch *url* and store it under *base_path*.

    A file extension is inferred from the response Content-Type when the
    URL path does not provide one.

    Returns the local file path, or None when the request fails or does
    not return HTTP 200.
    """
    # Third-party dependency imported lazily so the module itself (and
    # LinkExtractor) can be imported without requests installed.
    import requests

    try:
        resp = requests.get(url, timeout=timeout)
        if resp.status_code != 200:
            return None
        content_type = resp.headers.get('content-type', '')
        # guess_extension() returns None for parameterized values such as
        # "text/css; charset=utf-8", so strip parameters before guessing.
        mime = content_type.split(';')[0].strip()
        ext = mimetypes.guess_extension(mime) or '.bin'
        filename = os.path.basename(urllib.parse.urlparse(url).path)
        if not filename:
            filename = 'asset' + ext
        elif not os.path.splitext(filename)[1]:
            filename += ext
        filepath = os.path.join(base_path, filename)
        os.makedirs(base_path, exist_ok=True)
        with open(filepath, 'wb') as f:
            f.write(resp.content)
        return filepath
    except Exception as e:
        # Best-effort: a failed asset must not abort the crawl.
        print(f"Error downloading {url}: {e}")
        return None


def main():
    base_url = 'https://nixtamal.toast.al'
    import requests  # see note in download_asset

    # Check robots.txt before crawling anything.
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(base_url + '/robots.txt')
    try:
        rp.read()
    except (urllib.error.URLError, OSError):
        # BUG FIX: the original used a bare `except:` wrapped around the
        # can_fetch() check below, which also caught the SystemExit raised
        # by sys.exit(1) — so a robots.txt disallow was silently ignored
        # and the crawl proceeded anyway. The narrow except here only
        # covers the network read.
        print("Could not read robots.txt, proceeding assuming allowed")
    else:
        if not rp.can_fetch('*', base_url + '/'):
            print("Crawling not allowed by robots.txt")
            sys.exit(1)

    # makedirs creates intermediate directories, so this also creates
    # docs/archive.
    os.makedirs('docs/archive/assets', exist_ok=True)

    visited = set()
    seen = {base_url}        # every URL ever enqueued: O(1) dedup instead of
                             # the original O(n) `not in queue` deque scan
    queue = deque([base_url])
    pages_data = {}

    while queue:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        print(f"Crawling: {url}")
        try:
            resp = requests.get(url, timeout=10)
            if resp.status_code != 200:
                print(f"Skipping {url} with status {resp.status_code}")
                continue
            content = resp.text
            parser = LinkExtractor()
            parser.feed(content)

            # Resolve links and enqueue same-site ones. Fragments are
            # stripped so page#a / page#b are not crawled as distinct pages.
            abs_links = []
            for link in parser.links:
                abs_link = urllib.parse.urldefrag(
                    urllib.parse.urljoin(url, link)).url
                if abs_link.startswith(base_url):
                    abs_links.append(abs_link)
                    if abs_link not in seen:
                        seen.add(abs_link)
                        queue.append(abs_link)

            # Download same-site assets of each kind.
            assets = []
            for kind, refs in (('image', parser.images),
                               ('script', parser.scripts),
                               ('style', parser.styles)):
                for ref in refs:
                    asset_url = urllib.parse.urljoin(url, ref)
                    if asset_url.startswith(base_url):
                        path = download_asset(asset_url, 'docs/archive/assets')
                        if path:
                            assets.append({'type': kind,
                                           'url': asset_url,
                                           'local_path': path})

            # Save the raw page HTML under a flattened filename.
            path = urllib.parse.urlparse(url).path
            if not path or path == '/':
                filename = 'index.html'
            else:
                filename = path.strip('/').replace('/', '_') + '.html'
            filepath = os.path.join('docs/archive', filename)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)

            pages_data[url] = {
                'title': parser.title,
                'meta': parser.meta,
                'text': ' '.join(parser.text_parts),
                'links': abs_links,
                'assets': assets,
                'local_file': filepath,
            }
        except Exception as e:
            # Boundary handler: a single bad page must not stop the crawl.
            print(f"Error crawling {url}: {e}")

    with open('docs/archive/structure.json', 'w', encoding='utf-8') as f:
        json.dump(pages_data, f, indent=2, ensure_ascii=False)
    print("Crawling complete. Data saved to docs/archive/")


if __name__ == '__main__':
    main()