import os
import math
import datetime
import xml.etree.ElementTree as ET
from xml.dom import minidom

# --- CONFIGURATION (UPDATED FOR YOUR SERVER PATHS) ---
# Directory scanned for generated job pages; every *.html file found here
# becomes one <url> entry in the sitemaps.
SOURCE_HTML_DIR = r"/home/remotecurrentjob/public_html/rowjobs.site/job"

# Sitemap output directory — the web root, so the files are served live
# at https://rowjobs.site/<sitemap name>.
OUTPUT_DIR = r"/home/remotecurrentjob/public_html/rowjobs.site"

# URL prefix prepended to each HTML filename when building <loc> entries.
BASE_URL = "https://rowjobs.site/job/"

# Pages per sitemap file (the sitemaps.org protocol allows up to 50,000).
MAX_URLS_PER_SITEMAP = 5000
SITEMAP_FILENAME_PREFIX = "sitemap"
SITEMAP_INDEX_FILENAME = "sitemap_index.xml"

# SEO settings: when True, each <url> also gets <changefreq> and <priority>.
INCLUDE_CHANGEFREQ_PRIORITY = True
DEFAULT_CHANGEFREQ = "daily"
DEFAULT_PRIORITY = "0.9"

# Intent: stamp today's date in <lastmod> so crawlers treat every job as
# fresh/active.
# NOTE(review): neither flag below is consulted anywhere in this file —
# create_sitemap_xml() always uses today's date regardless. Confirm whether
# these are read by other scripts or are dead configuration.
FORCE_CURRENT_DATE_FOR_LASTMOD = True
SITEMAP_ONLY_TODAY_URLS = False 

# --- END CONFIGURATION ---

def prettify_xml(elem):
    """Serialize *elem* to pretty-printed, UTF-8 encoded XML bytes.

    The element is first dumped compactly with ElementTree, then re-parsed
    through minidom purely to apply two-space indentation.
    """
    compact = ET.tostring(elem, 'utf-8')
    dom = minidom.parseString(compact)
    return dom.toprettyxml(indent="  ", encoding="utf-8")

def create_sitemap_xml(urls, lastmod=None):
    """Build a <urlset> element (sitemaps.org 0.9 namespace) for *urls*.

    Parameters
    ----------
    urls : iterable of str
        Fully-qualified page URLs, one per <url> entry.
    lastmod : str, optional
        ISO-8601 date stamped into every <lastmod>. Defaults to today's
        date, preserving the original behavior of marking all pages fresh;
        the parameter lets callers honor a real modification date instead
        of the hard-coded "today".

    Returns
    -------
    xml.etree.ElementTree.Element
        Root <urlset> element, ready for serialization.
    """
    urlset = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
    if lastmod is None:
        lastmod = datetime.date.today().isoformat()

    for url in urls:
        url_elem = ET.SubElement(urlset, "url")
        ET.SubElement(url_elem, "loc").text = url
        ET.SubElement(url_elem, "lastmod").text = lastmod
        # Optional SEO hints, controlled by module-level configuration.
        if INCLUDE_CHANGEFREQ_PRIORITY:
            ET.SubElement(url_elem, "changefreq").text = DEFAULT_CHANGEFREQ
            ET.SubElement(url_elem, "priority").text = DEFAULT_PRIORITY
    return urlset

def main():
    """Generate chunked sitemaps plus a sitemap index for all job pages.

    Scans SOURCE_HTML_DIR for .html files, splits them into batches of
    MAX_URLS_PER_SITEMAP, writes one sitemap_<n>.xml per batch into
    OUTPUT_DIR, then writes SITEMAP_INDEX_FILENAME referencing each one.
    Prints progress to stdout; returns early (with a message) when the
    source directory is missing or contains no HTML files.
    """
    if not os.path.exists(SOURCE_HTML_DIR):
        print(f"Error: Directory {SOURCE_HTML_DIR} not found. Ensure the path is correct.")
        return

    # Sort so sitemap contents are deterministic across runs —
    # os.listdir() order is filesystem-dependent.
    html_files = sorted(f for f in os.listdir(SOURCE_HTML_DIR) if f.endswith(".html"))

    if not html_files:
        print(f"No HTML files found in: {SOURCE_HTML_DIR}")
        return

    total_sitemaps = math.ceil(len(html_files) / MAX_URLS_PER_SITEMAP)
    sitemap_files = []

    for i in range(total_sitemaps):
        start = i * MAX_URLS_PER_SITEMAP
        batch = html_files[start:start + MAX_URLS_PER_SITEMAP]

        # Build full URLs: domain.com/job/filename.html
        full_urls = [f"{BASE_URL}{f}" for f in batch]

        sitemap_xml = create_sitemap_xml(full_urls)
        filename = f"{SITEMAP_FILENAME_PREFIX}_{i+1}.xml"
        filepath = os.path.join(OUTPUT_DIR, filename)

        with open(filepath, "wb") as f:
            f.write(prettify_xml(sitemap_xml))

        sitemap_files.append(filename)
        # BUG FIX: the original printed the literal text "(unknown)" here
        # instead of the generated sitemap's filename.
        print(f"Generated: {filename} with {len(batch)} URLs")

    # Sitemap index: one <sitemap> entry per generated file. Index entries
    # live at the site root, not under /job/ like the pages themselves.
    sitemapindex = ET.Element("sitemapindex", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
    domain_base = "https://rowjobs.site" # Direct Root Domain
    today = datetime.date.today().isoformat()  # hoisted: one date for all entries

    for sm_file in sitemap_files:
        sitemap_elem = ET.SubElement(sitemapindex, "sitemap")
        ET.SubElement(sitemap_elem, "loc").text = f"{domain_base}/{sm_file}"
        ET.SubElement(sitemap_elem, "lastmod").text = today

    index_path = os.path.join(OUTPUT_DIR, SITEMAP_INDEX_FILENAME)
    with open(index_path, "wb") as f:
        f.write(prettify_xml(sitemapindex))

    print(f"\nSUCCESS: Sitemap Index created at {index_path}")

# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()