<?php
/**
 * CSV Type Detector for Screaming Fixes
 *
 * Auto-detects the type of Screaming Frog CSV export based on headers
 */

// Abort when the ABSPATH constant is not defined, so the file cannot be
// executed directly.
defined('ABSPATH') || exit;

class SF_CSV_Detector {

    /**
     * CSV type signatures, keyed by module ID.
     *
     * Each signature is an array with three keys:
     * - 'required': header substrings that must ALL be present;
     * - 'optional': header substrings reported by get_expected_columns();
     * - 'patterns': list of header-substring groups, at least one of which
     *   must be fully present for the signature to match.
     *
     * @var array
     */
    private $signatures = [];

    /**
     * Constructor - registers the built-in (and filtered) signatures.
     */
    public function __construct() {
        $this->register_signatures();
    }

    /**
     * Register CSV type signatures.
     *
     * All header fragments are lowercase because headers are lowercased
     * before matching. The final list is passed through the
     * 'sf_csv_signatures' filter.
     */
    private function register_signatures() {
        // Broken Links - Screaming Frog "All Inlinks" or "Client Error (4xx)" export
        $this->signatures['broken-links'] = [
            'required' => ['destination'],
            'optional' => ['status code', 'source', 'type', 'anchor'],
            'patterns' => [
                // All Inlinks export
                ['destination', 'source'],
                // Client Error export
                ['destination', 'status code'],
                // Response Codes export. Note: 'required' is checked before
                // patterns, so this only matches when a 'destination' header
                // is present as well.
                ['address', 'status code'],
            ],
        ];

        // Redirect Chains - Screaming Frog "Redirect Chains" export
        // Unique columns: Final Address, Number of Redirects, Chain Type
        $this->signatures['redirect-chains'] = [
            'required' => ['final address'],
            'optional' => ['address', 'source', 'number of redirects', 'chain type'],
            'patterns' => [
                ['final address', 'number of redirects', 'chain type'],
                ['final address', 'number of redirects'],
                ['final address', 'address', 'source'],
            ],
        ];

        // Images - Screaming Frog "Images" export (All Inlinks from Images tab)
        $this->signatures['image-alt-text'] = [
            'required' => ['alt text'],
            'optional' => ['destination', 'src', 'source', 'type', 'link position', 'size'],
            'patterns' => [
                // Screaming Frog Images export with inlinks
                ['destination', 'alt text'],
                ['destination', 'alt text', 'source'],
                // Direct image exports
                ['src', 'alt text'],
                ['image url', 'alt text'],
                // Type-based detection
                ['destination', 'alt text', 'type'],
            ],
        ];

        // Meta Description - Screaming Frog "Meta Description" export
        $this->signatures['meta-description'] = [
            'required' => ['address'],
            'optional' => ['meta description 1', 'meta description 1 length', 'meta description 1 pixel width', 'occurrences', 'indexability'],
            'patterns' => [
                // Standard Meta Description export
                ['address', 'meta description 1'],
                ['address', 'meta description'],
                // With length info
                ['address', 'meta description 1', 'meta description 1 length'],
            ],
        ];

        // Page Title - Screaming Frog "Page Titles" export
        $this->signatures['page-title'] = [
            'required' => ['address'],
            'optional' => ['title 1', 'title 1 length', 'title 1 pixel width', 'occurrences', 'indexability'],
            'patterns' => [
                // Standard Page Titles export
                ['address', 'title 1'],
                // With length info
                ['address', 'title 1', 'title 1 length'],
            ],
        ];

        // Internal Link Builder - Bulk upload CSV format
        $this->signatures['internal-link-builder'] = [
            'required' => ['anchor'],
            'optional' => ['source', 'target'],
            'patterns' => [
                // Standard format: Source_URL, Anchor_Text, Target_URL
                ['source', 'anchor', 'target'],
                // Alternative column names
                ['source_url', 'anchor_text', 'target_url'],
                ['from', 'anchor', 'to'],
                ['page', 'keyword', 'link'],
            ],
        ];

        // Allow modules to register additional signatures. A filter callback
        // that returns a non-array would break every later lookup, so the
        // built-in list is kept in that case.
        $filtered = apply_filters('sf_csv_signatures', $this->signatures);
        if (is_array($filtered)) {
            $this->signatures = $filtered;
        }
    }

    /**
     * Detect CSV type from file
     *
     * @param string $file_path Path to CSV file
     * @return string Module ID or 'unknown'
     */
    public function detect($file_path) {
        $headers = $this->get_headers($file_path);

        if (!$headers) {
            return 'unknown';
        }

        return $this->detect_from_headers($headers);
    }

    /**
     * Detect CSV type from headers array
     *
     * Dedicated checks run first, in a fixed order, then the remaining
     * registered signatures, then a loose keyword fallback.
     *
     * @param array $headers CSV column headers
     * @return string Module ID or 'unknown'
     */
    public function detect_from_headers($headers) {
        if (!is_array($headers) || !$headers) {
            return 'unknown';
        }

        $normalized = self::normalize_headers($headers);

        // 1. Internal Link Builder: source_url, anchor_text, target_url style
        //    columns. Checked first because it is a specific bulk-upload format.
        if ($this->is_internal_link_builder_csv($normalized)) {
            return 'internal-link-builder';
        }

        // 2. Redirect Chains: final address, number of redirects, chain type.
        if ($this->is_redirect_chains_csv($normalized)) {
            return 'redirect-chains';
        }

        // 3. Image Alt Text: more specific than Broken Links, so it must be
        //    checked BEFORE the broken-links signature.
        if ($this->is_image_alt_text_csv($normalized)) {
            return 'image-alt-text';
        }

        // 4. Page Title: checked BEFORE meta-description since both have an
        //    'address' column.
        if ($this->is_page_title_csv($normalized)) {
            return 'page-title';
        }

        // 5. Meta Description: checked BEFORE broken-links since both can have
        //    an 'address' column.
        if ($this->is_meta_description_csv($normalized)) {
            return 'meta-description';
        }

        // Remaining registered signatures (built-in broken-links plus anything
        // added through the filter). Types with a dedicated check above are
        // skipped so the looser substring signatures cannot override them.
        $dedicated = ['internal-link-builder', 'image-alt-text', 'redirect-chains', 'meta-description', 'page-title'];

        foreach ($this->signatures as $module_id => $signature) {
            if (in_array($module_id, $dedicated, true)) {
                continue;
            }
            if (is_array($signature) && $this->matches_signature($normalized, $signature)) {
                return $module_id;
            }
        }

        // Last resort: loose keyword matching.
        $detected = $this->detect_by_patterns($normalized);

        return $detected ? $detected : 'unknown';
    }

    /**
     * Check if CSV is specifically an Image Alt Text export
     *
     * Both Broken Links and Image Alt Text CSVs can have "Alt Text" (and
     * "Size (Bytes)") columns. The differentiator is a dimension-related
     * column:
     * - Real Dimensions
     * - Dimensions in Attributes
     * - Display Dimensions
     * - Potential Savings (bytes)
     *
     * Without any of those, the CSV still counts as an image export when it
     * has an "alt text" column plus an exact "destination" or "source"
     * column and no header containing "status".
     *
     * @param array $headers Normalized headers
     * @return bool
     */
    private function is_image_alt_text_csv($headers) {
        $image_only_columns = [
            'real dimensions',
            'dimensions in attributes',
            'display dimensions',
            'potential savings (bytes)',
        ];

        if (self::count_headers_with_any($headers, $image_only_columns) > 0) {
            // Found an image-only column - this is definitely an Images export
            return true;
        }

        // Simplified image CSVs without dimension columns
        // (e.g., Source, Destination, Alt Text).
        $has_alt_text = self::any_header_contains($headers, 'alt text');
        $has_destination_or_source = in_array('destination', $headers, true)
            || in_array('source', $headers, true);
        // 'status' also covers 'status code'.
        $has_status = self::any_header_contains($headers, 'status');

        return $has_alt_text && $has_destination_or_source && !$has_status;
    }

    /**
     * Check if CSV is specifically a Meta Description export
     *
     * Requires an exact 'address' column and at least one header containing
     * one of the meta-description column names.
     *
     * @param array $headers Normalized headers
     * @return bool
     */
    private function is_meta_description_csv($headers) {
        $meta_desc_columns = [
            'meta description 1',
            'meta description 1 length',
            'meta description 1 pixel width',
        ];

        return in_array('address', $headers, true)
            && self::count_headers_with_any($headers, $meta_desc_columns) >= 1;
    }

    /**
     * Check if CSV is specifically a Page Title export
     *
     * Requires an exact 'address' column and at least one header containing
     * one of the page-title column names.
     *
     * @param array $headers Normalized headers
     * @return bool
     */
    private function is_page_title_csv($headers) {
        $page_title_columns = [
            'title 1',
            'title 1 length',
            'title 1 pixel width',
        ];

        return in_array('address', $headers, true)
            && self::count_headers_with_any($headers, $page_title_columns) >= 1;
    }

    /**
     * Check if CSV is specifically an Internal Link Builder bulk upload
     *
     * Supports two formats:
     * 1. Standard format: Source_URL, Anchor_Text, Target_URL
     * 2. Orphan Pages format: URL (target), Source (ignored), Source_URL, Anchor_Text
     *
     * @param array $headers Normalized headers
     * @return bool
     */
    private function is_internal_link_builder_csv($headers) {
        // Source column (where the link will be added)
        $source_patterns = ['source_url', 'sourceurl', 'from_url', 'fromurl', 'page_url', 'pageurl'];
        // Anchor text column
        $anchor_patterns = ['anchor_text', 'anchor', 'anchortext', 'link_text', 'linktext'];
        // Target column (where the link points to)
        $target_patterns = ['target_url', 'target', 'targeturl', 'destination', 'to_url', 'link_url', 'linkurl'];

        // Both formats need a source and an anchor column.
        if (
            self::count_headers_with_any($headers, $source_patterns) === 0 ||
            self::count_headers_with_any($headers, $anchor_patterns) === 0
        ) {
            return false;
        }

        // Standard format: explicit target column.
        if (self::count_headers_with_any($headers, $target_patterns) > 0) {
            return true;
        }

        // Orphan Pages format: an exact 'url' column serves as the target.
        return in_array('url', $headers, true);
    }

    /**
     * Check if CSV is specifically a Redirect Chains export
     *
     * Counts the headers that contain 'final address', 'number of redirects'
     * or 'chain type'; two or more such headers identify the export.
     *
     * @param array $headers Normalized headers
     * @return bool
     */
    private function is_redirect_chains_csv($headers) {
        $redirect_chains_columns = [
            'final address',
            'number of redirects',
            'chain type',
        ];

        return self::count_headers_with_any($headers, $redirect_chains_columns) >= 2;
    }

    /**
     * Check if headers match a signature
     *
     * Every 'required' fragment must appear in some header, and at least one
     * 'patterns' group must be fully present. Missing keys (possible for
     * signatures added through the filter) are treated as empty lists, so a
     * signature without patterns never matches.
     *
     * @param array $headers Normalized headers
     * @param array $signature Signature definition
     * @return bool
     */
    private function matches_signature($headers, $signature) {
        $required = isset($signature['required']) ? (array) $signature['required'] : [];
        $patterns = isset($signature['patterns']) ? (array) $signature['patterns'] : [];

        if (!self::contains_all($headers, $required)) {
            return false;
        }

        foreach ($patterns as $pattern) {
            if (self::contains_all($headers, (array) $pattern)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Detect CSV type by common patterns
     *
     * Loose fallback that searches the space-joined header line for keywords.
     *
     * @param array $headers Normalized headers
     * @return string|null Module ID or null
     */
    private function detect_by_patterns($headers) {
        $headers_str = implode(' ', $headers);

        // Image-specific columns. "size (bytes)" exists in BOTH Broken Links
        // and Images CSVs, so it is deliberately not checked.
        $has_image_specific = self::contains_any($headers_str, [
            'real dimensions',
            'dimensions in attributes',
            'display dimensions',
            'potential savings',
        ]);

        // Image patterns - must have image-specific columns
        if ($has_image_specific && strpos($headers_str, 'alt') !== false) {
            return 'image-alt-text';
        }

        // Broken links patterns (destination/url + status, without image-specific columns)
        if (
            !$has_image_specific &&
            self::contains_any($headers_str, ['destination', 'url']) &&
            self::contains_any($headers_str, ['status', '404', 'broken'])
        ) {
            return 'broken-links';
        }

        // Redirect chain patterns
        if (
            strpos($headers_str, 'redirect') !== false &&
            self::contains_any($headers_str, ['chain', 'hop', 'final'])
        ) {
            return 'redirect-chains';
        }

        $has_address = strpos($headers_str, 'address') !== false;

        // Meta description patterns
        if ($has_address && strpos($headers_str, 'meta description') !== false) {
            return 'meta-description';
        }

        // Page title patterns
        if ($has_address && strpos($headers_str, 'title 1') !== false) {
            return 'page-title';
        }

        return null;
    }

    /**
     * Get headers from CSV file
     *
     * Reads the first CSV row, skipping a leading UTF-8 BOM. Control bytes
     * and all bytes >= 0x80 are stripped from each cell, so non-ASCII
     * characters do not survive in the returned headers.
     *
     * @param string $file_path Path to CSV file
     * @return array|false Headers or false on error (missing/unreadable file,
     *                     empty file, or blank first line)
     */
    public function get_headers($file_path) {
        // is_file() rejects directories, which file_exists() would accept.
        if (!is_file($file_path) || !is_readable($file_path)) {
            return false;
        }

        $handle = fopen($file_path, 'r');

        if (!$handle) {
            return false;
        }

        // Skip a UTF-8 BOM if present; otherwise start again from byte 0.
        $bom = fread($handle, 3);
        if ($bom !== "\xEF\xBB\xBF") {
            rewind($handle);
        }

        // Explicit arguments equal the historical defaults; omitting the
        // escape argument is deprecated in newer PHP versions.
        $headers = fgetcsv($handle, 0, ',', '"', '\\');
        fclose($handle);

        if (!$headers || !is_array($headers)) {
            return false;
        }

        // fgetcsv() reports a blank line as an array holding a single null.
        if ($headers === [null]) {
            return false;
        }

        // Clean headers
        return array_map(function ($header) {
            $header = (string) preg_replace('/[\x00-\x1F\x80-\xFF]/', '', (string) $header);
            return trim($header);
        }, $headers);
    }

    /**
     * Get expected columns for a module
     *
     * @param string $module_id Module identifier
     * @return array Array with 'required' and 'optional' lists, or an empty
     *               array for an unregistered module
     */
    public function get_expected_columns($module_id) {
        if (!isset($this->signatures[$module_id])) {
            return [];
        }

        $signature = $this->signatures[$module_id];

        // Signatures added through the filter may omit either key.
        return [
            'required' => $signature['required'] ?? [],
            'optional' => $signature['optional'] ?? [],
        ];
    }

    /**
     * Validate CSV has expected columns for a module
     *
     * @param string $file_path Path to CSV file
     * @param string $module_id Expected module
     * @return true|WP_Error True when the detected type equals $module_id
     */
    public function validate_for_module($file_path, $module_id) {
        $headers = $this->get_headers($file_path);

        if (!$headers) {
            return new WP_Error(
                'invalid_csv',
                __('Could not read CSV file headers.', 'screaming-fixes')
            );
        }

        $detected = $this->detect_from_headers($headers);

        if ($detected === $module_id) {
            return true;
        }

        return new WP_Error(
            'wrong_csv_type',
            sprintf(
                /* translators: 1: detected CSV type label, 2: expected CSV type label */
                __('This CSV appears to be for %s, not %s.', 'screaming-fixes'),
                $this->get_module_label($detected),
                $this->get_module_label($module_id)
            )
        );
    }

    /**
     * Get human-readable label for module ID
     *
     * @param string $module_id Module identifier
     * @return string Label, or the ID itself when no label is known
     */
    private function get_module_label($module_id) {
        $labels = [
            'broken-links' => __('Broken Links', 'screaming-fixes'),
            'redirect-chains' => __('Redirect Chains', 'screaming-fixes'),
            'image-alt-text' => __('Image Alt Text', 'screaming-fixes'),
            'meta-description' => __('Meta Description', 'screaming-fixes'),
            'page-title' => __('Page Title', 'screaming-fixes'),
            'internal-link-builder' => __('Internal Link Builder', 'screaming-fixes'),
            'backlink-reclaim' => __('Backlink Reclaim', 'screaming-fixes'),
            'unknown' => __('Unknown', 'screaming-fixes'),
        ];

        return $labels[$module_id] ?? $module_id;
    }

    /**
     * Get all registered CSV types
     *
     * @return array Module IDs
     */
    public function get_registered_types() {
        return array_keys($this->signatures);
    }

    /**
     * Check if module can handle CSV based on headers
     * This is a static helper for modules to use
     *
     * Matching is case-insensitive and substring-based: every entry of
     * $required_headers must occur inside at least one header.
     *
     * @param array $headers CSV headers
     * @param array $required_headers Required header patterns
     * @return bool
     */
    public static function headers_contain($headers, $required_headers) {
        return self::contains_all(self::normalize_headers($headers), $required_headers);
    }

    /**
     * Trim and lowercase every header cell.
     *
     * Non-scalar cells (e.g. null) become empty strings so the string
     * functions never receive a non-string argument.
     *
     * @param array $headers Raw headers
     * @return array Normalized headers (keys preserved)
     */
    private static function normalize_headers(array $headers) {
        return array_map(function ($header) {
            return strtolower(trim(is_scalar($header) ? (string) $header : ''));
        }, $headers);
    }

    /**
     * Check whether a string contains at least one of the given substrings.
     *
     * @param string $haystack String to search
     * @param array $needles Substrings to look for
     * @return bool
     */
    private static function contains_any($haystack, array $needles) {
        foreach ($needles as $needle) {
            if (strpos($haystack, $needle) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Check whether any header contains the given substring.
     *
     * @param array $headers Normalized headers
     * @param string $needle Substring to look for
     * @return bool
     */
    private static function any_header_contains(array $headers, $needle) {
        foreach ($headers as $header) {
            if (strpos($header, $needle) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Count the headers that contain at least one of the given substrings.
     *
     * @param array $headers Normalized headers
     * @param array $needles Substrings to look for
     * @return int Number of matching headers
     */
    private static function count_headers_with_any(array $headers, array $needles) {
        $count = 0;

        foreach ($headers as $header) {
            if (self::contains_any($header, $needles)) {
                $count++;
            }
        }

        return $count;
    }

    /**
     * Check that every fragment occurs inside at least one header.
     *
     * Fragments are lowercased because the headers are lowercase.
     *
     * @param array $headers Normalized headers
     * @param array $needles Header fragments that must all be present
     * @return bool
     */
    private static function contains_all(array $headers, $needles) {
        foreach ((array) $needles as $needle) {
            if (!self::any_header_contains($headers, strtolower((string) $needle))) {
                return false;
            }
        }

        return true;
    }
}
