# -*- coding: utf-8 -*-
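"""Scrape job postings from a Shopify blog into an NDJSON feed.

Reads URLs from a local copy of the site's sitemap, fetches each
/blogs/news/ post, extracts the title and body, and appends one JSON
object per job to jobs.ndjson. A checkpoint file makes runs resumable.
"""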
import requests
import json
import re
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
import datetime
from bs4 import BeautifulSoup

lock = threading.Lock()  # serializes file writes across worker threads
OUTPUT_FILE = "jobs.ndjson"          # one JSON object per line
CHECKPOINT_FILE = "checkpoint.txt"   # URLs already processed, for resumable runs
SITEMAP_LOCAL = "sitemap_local.xml"  # local copy of the site's sitemap
MY_SPECIFIC_APPLY_LINK = "https://rowjobs.site"  # apply URL written to every record

def extract_job_postings(html_content):
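    """Parse a blog-post page into job dicts; return None if the request was blocked."""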
    if "Just a moment..." in html_content or "verify_checkpoint" in html_content:
        return None # Blocked!

    soup = BeautifulSoup(html_content, 'html.parser')
    title_tag = soup.find('h1') or soup.find('title')
    title = title_tag.get_text(strip=True) if title_tag else "Remote Job"
    # Trim "| Site" / " - Site" suffixes; splitting on the spaced separator
    # keeps hyphenated titles like "Front-End Developer" intact.
    title = title.split('|')[0].split(' - ')[0].strip()
    
    # Shopify article body containers, tried from most to least specific.
    desc_tag = soup.find('div', class_='article-template__content') or \
               soup.find('div', class_='rte') or \
               soup.find('article')
    
    description = str(desc_tag) if desc_tag else "Professional remote opportunity."
    return [{"title": title, "description": description}]

def process_and_save(url):
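    """Fetch one job URL, append the result to the NDJSON file, and checkpoint it."""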
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    }
    try:
        # A small per-request delay keeps the crawl under the firewall's rate limits.
        time.sleep(0.5)
        res = requests.get(url, headers=headers, timeout=20)
        res.raise_for_status()  # skip 4xx/5xx error pages instead of saving them as jobs

        job_data = extract_job_postings(res.text)
        if job_data:
            with lock:
                with open(OUTPUT_FILE, "a", encoding="utf-8") as f:
                    for j in job_data:
                        f.write(json.dumps({
                            "title": j["title"],
                            "description": j["description"],
                            "apply_url": MY_SPECIFIC_APPLY_LINK,
                            "date": datetime.date.today().isoformat()
                        }, ensure_ascii=False) + "\n")
                with open(CHECKPOINT_FILE, "a", encoding="utf-8") as f:
                    f.write(url + "\n")
            return True
        return False
    except Exception:
        # Per-URL failures (timeouts, HTTP errors, parse issues) are swallowed;
        # the URL is left out of the checkpoint file, so it is retried next run.
        return False

def main():
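    """Read the local sitemap, filter job URLs, and scrape whatever is unprocessed."""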
    if not os.path.exists(SITEMAP_LOCAL):
        print(f"Error: {SITEMAP_LOCAL} missing!")
        return

    with open(SITEMAP_LOCAL, "r", encoding="utf-8") as f:
        # A simple regex avoids XML namespace handling for <loc> entries.
        all_urls = re.findall(r'<loc>(.*?)</loc>', f.read())

    # Job postings live under the blog; dedupe while preserving sitemap order.
    job_urls = [u for u in all_urls if "/blogs/news/" in u]
    unique_urls = list(dict.fromkeys(job_urls))

    processed = set()
    if os.path.exists(CHECKPOINT_FILE):
        with open(CHECKPOINT_FILE, "r", encoding="utf-8") as f:
            processed = {line.strip() for line in f if line.strip()}
    
    remaining = [u for u in unique_urls if u not in processed]
    print(f"Starting Scrape: {len(remaining)} jobs to go...")

    # Keep max_workers low to avoid triggering the "Just a moment..." challenge.
    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = [executor.submit(process_and_save, url) for url in remaining]
        saved = 0
        for i, future in enumerate(as_completed(futures), 1):
            if future.result():
                saved += 1
            if i % 10 == 0:
                print(f"Progress: {i}/{len(remaining)} URLs processed...")

    print(f"Done: saved {saved} of {len(remaining)} URLs.")

if __name__ == "__main__":
    main()