<?php
// Stop immediately when the ABSPATH constant is not defined (file requested directly).
defined('ABSPATH') || exit;

/**
 * Small naive-Bayes-style text scorer backed by a per-token counts table.
 *
 * Per-token spam/ham counters live in the "{prefix}asg_ml_tokens" table and the
 * number of trained documents per class lives in the "asg_ml_totals" option.
 * Derived aggregates (vocabulary size, per-class token totals) are kept in
 * transients that are dropped after every training call.
 */
final class ASG_ML {
    /**
     * Stopword lookup: word => index, used only for isset() membership tests.
     *
     * @var array
     */
    private $stopwords;

    /**
     * Build the stopword lookup table.
     */
    public function __construct() {
        $this->stopwords = array_flip(array(
            'the','and','for','that','this','with','you','your','from','are','was','were','will','have','has','had',
            'not','but','they','them','their','our','ours','can','could','would','should','a','an','to','of','in','on',
            'at','as','is','it','be','or','by','we','i','me','my'
        ));
    }

    /**
     * Score a text for spam likelihood.
     *
     * Combines add-one-smoothed class priors with add-one-smoothed per-token
     * likelihoods (each distinct token counted once), squashes the clamped
     * log-odds through a logistic function and rescales it to a signed score.
     *
     * @param mixed $text Text to score; cast to string and stripped of tags.
     * @return int|float 0 when fewer than three tokens are extracted, otherwise a
     *                   float between about -0.9993 and 0.9993 (the log-odds are
     *                   clamped to [-8, 8]); positive values lean towards spam.
     */
    public function score_text($text) {
        $text   = wp_strip_all_tags((string) $text);
        $tokens = $this->tokenize($text);
        if (count($tokens) < 3) { return 0; }

        $totals = get_option('asg_ml_totals', array('spam_docs'=>0,'ham_docs'=>0));
        if (!is_array($totals)) { $totals = array(); }
        $spam_docs = isset($totals['spam_docs']) ? max(0, intval($totals['spam_docs'])) : 0;
        $ham_docs  = isset($totals['ham_docs'])  ? max(0, intval($totals['ham_docs']))  : 0;

        // Training counts each distinct token once per document, so score the same way.
        $unique = array_values(array_unique($tokens));
        $map    = $this->fetch_token_counts($unique);

        // Smoothed class priors; both are 0.5 while nothing has been trained.
        $p_spam = ($spam_docs + 1) / ($spam_docs + $ham_docs + 2);
        $p_ham  = ($ham_docs + 1) / ($spam_docs + $ham_docs + 2);

        $log_spam = log($p_spam);
        $log_ham  = log($p_ham);

        // vocab_size() never returns less than 100, so the denominators below are non-zero.
        $vocab        = $this->vocab_size();
        $class_totals = $this->class_token_totals();

        foreach ($unique as $t) {
            $s = isset($map[$t]) ? $map[$t]['s'] : 0;
            $h = isset($map[$t]) ? $map[$t]['h'] : 0;

            $ps = ($s + 1) / ($class_totals['spam'] + $vocab);
            $ph = ($h + 1) / ($class_totals['ham']  + $vocab);

            $log_spam += log($ps);
            $log_ham  += log($ph);
        }

        // Clamp the log-odds so exp() stays well-behaved and the score saturates.
        $diff = max(-8, min(8, $log_spam - $log_ham));

        $prob_spam = 1 / (1 + exp(-$diff)); // 0..1
        return ($prob_spam * 2) - 1;
    }

    /**
     * Train the model with one labelled text.
     *
     * Increments the document counter of the chosen class, then increments the
     * matching counter of every distinct token (inserting unseen tokens), and
     * finally drops the cached aggregates.
     *
     * @param mixed $text    Text to learn from; cast to string and stripped of tags.
     * @param mixed $is_spam Truthy to train as spam, falsy to train as ham.
     * @return void
     */
    public function train_text($text, $is_spam) {
        $text   = wp_strip_all_tags((string) $text);
        $tokens = $this->tokenize($text);
        if (!$tokens) { return; }

        global $wpdb;
        $table = $this->table_name();

        $totals = get_option('asg_ml_totals', array('spam_docs'=>0,'ham_docs'=>0));
        if (!is_array($totals)) { $totals = array('spam_docs'=>0,'ham_docs'=>0); }
        $doc_key = $is_spam ? 'spam_docs' : 'ham_docs';
        // Guard against an option array that lacks the key being incremented.
        $totals[$doc_key] = (isset($totals[$doc_key]) ? intval($totals[$doc_key]) : 0) + 1;
        update_option('asg_ml_totals', $totals, false);

        $spam_inc = $is_spam ? 1 : 0;
        $ham_inc  = $is_spam ? 0 : 1;

        foreach (array_unique($tokens) as $tok) {
            if ($tok === '') { continue; }

            $cache_key = 'asg_ml_tok_' . md5( (string) $table . '|' . (string) $tok );
            $existing  = wp_cache_get( $cache_key, 'asg_ml' );
            // Only a cached row is trusted; a miss or a cached "not found" is re-read,
            // otherwise a token inserted earlier would be inserted a second time.
            if ( ! is_array( $existing ) ) {
                $existing = $wpdb->get_row( // phpcs:ignore WordPress.DB.DirectDatabaseQuery.DirectQuery
                    $wpdb->prepare(
                        'SELECT token, spam_count, ham_count FROM %i WHERE token=%s',
                        $table,
                        $tok
                    ),
                    ARRAY_A
                );
            }

            if (is_array($existing)) {
                $row = array(
                    'token'      => $tok,
                    'spam_count' => (int) $existing['spam_count'] + $spam_inc,
                    'ham_count'  => (int) $existing['ham_count']  + $ham_inc,
                );
                $ok = $wpdb->update($table, array( // phpcs:ignore WordPress.DB.DirectDatabaseQuery.DirectQuery, WordPress.DB.DirectDatabaseQuery.NoCaching
                    'spam_count' => $row['spam_count'],
                    'ham_count'  => $row['ham_count'],
                ), array('token'=>$tok), array('%d','%d'), array('%s'));
            } else {
                $row = array(
                    'token'      => $tok,
                    'spam_count' => $spam_inc,
                    'ham_count'  => $ham_inc,
                );
                $ok = $wpdb->insert($table, $row, array('%s','%d','%d')); // phpcs:ignore WordPress.DB.DirectDatabaseQuery.DirectQuery, WordPress.DB.DirectDatabaseQuery.NoCaching
            }

            // Keep the cache in step with what was written so the next training
            // call does not compute its increment from stale counts.
            if (false === $ok) {
                wp_cache_delete( $cache_key, 'asg_ml' );
            } else {
                wp_cache_set( $cache_key, $row, 'asg_ml', 3600 );
            }
        }

        delete_transient('asg_ml_vocab');
        delete_transient('asg_ml_class_totals');
    }

    /**
     * Turn a text into a list of feature tokens.
     *
     * URLs and e-mail addresses are replaced by "__url__" / "__email__", the text is
     * split on anything other than ASCII letters, digits and underscore, pieces
     * shorter than three bytes and stopwords are dropped, and all-digit pieces become
     * "__num__". Marker tokens are appended for 2+ and 5+ URLs, followed by bigrams
     * and trigrams built from the first 28 tokens.
     *
     * @param string $text Plain text.
     * @return array List of tokens; may contain duplicates.
     */
    private function tokenize($text) {
        $text = strtolower($text);
        $text = preg_replace('/https?:\/\/[^\s]+/i', ' __url__ ', $text);
        $text = preg_replace('/[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}/i', ' __email__ ', $text);

        $out = array();
        foreach (preg_split('/[^a-z0-9_]+/i', $text) as $p) {
            // Pieces contain only [a-z0-9_], so the length check also covers empty pieces.
            if (strlen($p) < 3) { continue; }
            if (isset($this->stopwords[$p])) { continue; }
            if (preg_match('/^\d+$/', $p)) { $p = '__num__'; }
            $out[] = $p;
        }

        $links = substr_count($text, '__url__');
        if ($links >= 2) { $out[] = '__many_urls__'; }
        if ($links >= 5) { $out[] = '__lots_urls__'; }

        // N-grams are built from a bounded prefix so long texts do not explode the feature list.
        $max  = 28;
        $base = array_slice($out, 0, $max);
        $n    = count($base);
        for ($i = 0; $i < $n - 1; $i++) {
            $out[] = $base[$i] . '_' . $base[$i+1];
            if ($i < $n - 2) { $out[] = $base[$i] . '_' . $base[$i+1] . '_' . $base[$i+2]; }
        }

        return $out;
    }

    /**
     * Load the stored spam/ham counts for the given tokens.
     *
     * Tokens are looked up in chunks so a very long text cannot produce an
     * arbitrarily large IN() list.
     *
     * @param array $tokens Distinct tokens to look up.
     * @return array Map of token => array('s' => spam count, 'h' => ham count);
     *               tokens without a row are absent.
     */
    private function fetch_token_counts($tokens) {
        $map = array();
        if (!$tokens) { return $map; }

        global $wpdb;
        $table = $this->table_name();

        foreach (array_chunk($tokens, 200) as $chunk) {
            $placeholders = implode(',', array_fill(0, count($chunk), '%s'));
            $rows = $wpdb->get_results( // phpcs:ignore WordPress.DB.DirectDatabaseQuery.DirectQuery, WordPress.DB.DirectDatabaseQuery.NoCaching
                $wpdb->prepare(
                    "SELECT token, spam_count, ham_count FROM %i WHERE token IN ($placeholders)", // phpcs:ignore WordPress.DB.PreparedSQL.InterpolatedNotPrepared, WordPress.DB.PreparedSQLPlaceholders.UnfinishedPrepare
                    array_merge(array($table), $chunk)
                ),
                ARRAY_A
            );
            if (!is_array($rows)) { continue; }
            foreach ($rows as $r) {
                $map[$r['token']] = array('s'=>(int)$r['spam_count'], 'h'=>(int)$r['ham_count']);
            }
        }

        return $map;
    }

    /**
     * Number of rows in the token table, floored at 100 and cached for an hour.
     *
     * @return int Vocabulary size used for smoothing (never below 100).
     */
    private function vocab_size() {
        $cached = get_transient('asg_ml_vocab');
        if (is_numeric($cached) && $cached > 0) { return (int)$cached; }

        global $wpdb;
        $table = $this->table_name();
        $v = (int) $wpdb->get_var( $wpdb->prepare( 'SELECT COUNT(*) FROM %i WHERE 1=%d', $table, 1 ) ); // phpcs:ignore WordPress.DB.DirectDatabaseQuery.DirectQuery, WordPress.DB.DirectDatabaseQuery.NoCaching
        if ($v < 100) { $v = 100; }
        set_transient('asg_ml_vocab', $v, HOUR_IN_SECONDS);
        return $v;
    }

    /**
     * Sum of all spam counts and of all ham counts in the token table, cached for an hour.
     *
     * @return array array('spam' => int, 'ham' => int), both non-negative.
     */
    private function class_token_totals() {
        $cached = get_transient('asg_ml_class_totals');
        if (is_array($cached) && isset($cached['spam']) && isset($cached['ham'])) { return $cached; }

        global $wpdb;
        $table = $this->table_name();
        $spam = (int) $wpdb->get_var( $wpdb->prepare( 'SELECT SUM(spam_count) FROM %i WHERE 1=%d', $table, 1 ) ); // phpcs:ignore WordPress.DB.DirectDatabaseQuery.DirectQuery, WordPress.DB.DirectDatabaseQuery.NoCaching
        $ham  = (int) $wpdb->get_var( $wpdb->prepare( 'SELECT SUM(ham_count) FROM %i WHERE 1=%d', $table, 1 ) ); // phpcs:ignore WordPress.DB.DirectDatabaseQuery.DirectQuery, WordPress.DB.DirectDatabaseQuery.NoCaching
        $out = array('spam'=>max(0,$spam), 'ham'=>max(0,$ham));
        set_transient('asg_ml_class_totals', $out, HOUR_IN_SECONDS);
        return $out;
    }

    /**
     * Name of the token counts table for the current $wpdb prefix.
     *
     * @return string Sanitized prefix followed by "asg_ml_tokens".
     */
    private function table_name() {
        global $wpdb;
        return self::sanitize_db_prefix($wpdb->prefix) . 'asg_ml_tokens';
    }

    /**
     * Sanitize a DB prefix for safe identifier usage.
     *
     * @param string $prefix Raw $wpdb->prefix.
     * @return string Prefix with everything except A-Z, a-z, 0-9 and "_" removed
     *                (possibly empty).
     */
    private static function sanitize_db_prefix( $prefix ) {
        return (string) preg_replace( '/[^A-Za-z0-9_]/', '', (string) $prefix );
    }
}
