hvac-kia-content/src/tiktok_scraper_advanced.py
Ben Reed daab901e35 refactor: Update naming convention from hvacknowitall to hkia
Major Changes:
- Updated all code references from hvacknowitall/hvacnkowitall to hkia
- Renamed all existing markdown files to use hkia_ prefix
- Updated configuration files, scrapers, and production scripts
- Modified systemd service descriptions to use HKIA
- Changed NAS sync path to /mnt/nas/hkia

Files Updated:
- 20+ source files updated with new naming convention
- 34 markdown files renamed to hkia_* format
- All ScraperConfig brand_name parameters now use 'hkia'
- Documentation updated to reflect new naming

Rationale:
- Shorter, cleaner filenames
- Consistent branding across all outputs
- Easier to type and reference
- Maintains same functionality with improved naming

Next Steps:
- Deploy updated services to production
- Update any external references to old naming
- Monitor scrapers to ensure proper operation

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-19 13:35:23 -03:00

import os
import time
import random
from typing import Any, Dict, List, Optional
from datetime import datetime, timedelta
from pathlib import Path
import json
import re
from scrapling import StealthyFetcher, Adaptor
from src.base_scraper import BaseScraper, ScraperConfig


class TikTokScraperAdvanced(BaseScraper):
    """TikTok scraper using advanced Scrapling configuration for bot-detection avoidance."""

    def __init__(self, config: ScraperConfig):
        super().__init__(config)
        self.target_username = os.getenv('TIKTOK_TARGET', 'hkia')
        self.base_url = f"https://www.tiktok.com/@{self.target_username}"

        # Configure global StealthyFetcher settings
        StealthyFetcher.auto_match = True  # Enable automatic element matching
        StealthyFetcher.huge_tree = True   # Allow large HTML trees

    def _enhanced_typing(self, element, text: str):
        """Simulate realistic typing patterns (roughly 30-70 WPM, with occasional typos)."""
        for char in text:
            # Variable typing speed
            base_delay = random.uniform(0.08, 0.25)

            # Pause on complex characters
            if char in '@._-':
                base_delay *= random.uniform(1.2, 2.0)

            # Occasional hesitation (10% chance)
            if random.random() < 0.1:
                time.sleep(random.uniform(0.3, 0.8))

            element.type(char)
            time.sleep(base_delay)

            # Typo correction (3% chance)
            if random.random() < 0.03:
                element.press('Backspace')
                time.sleep(random.uniform(0.1, 0.3))
                element.type(char)

    def _advanced_human_simulation(self, page):
        """Simulate natural page-reading behavior (scrolling and mouse movement)."""
        try:
            viewport_height = page.viewport_size.get('height', 800)

            # Natural scrolling patterns
            for i in range(random.randint(3, 6)):
                scroll_amount = random.randint(100, viewport_height // 3)
                page.mouse.wheel(0, scroll_amount)
                time.sleep(random.uniform(0.8, 2.5))  # Reading time

                # Occasional back-scroll (re-reading)
                if random.random() < 0.3:
                    page.mouse.wheel(0, -random.randint(50, 150))

            # Random mouse movements
            for _ in range(random.randint(2, 4)):
                x = random.randint(100, page.viewport_size.get('width', 1200) - 100)
                y = random.randint(100, page.viewport_size.get('height', 800) - 100)
                page.mouse.move(x, y)
                time.sleep(random.uniform(0.3, 0.8))
        except Exception as e:
            self.logger.debug(f"Human simulation error (non-critical): {e}")

    def _human_delay(self, min_seconds: float = 2, max_seconds: float = 5) -> None:
        """Sleep for a random, human-like interval between actions."""
        delay = random.uniform(min_seconds, max_seconds)
        self.logger.debug(f"Waiting {delay:.2f} seconds (human-like delay)...")
        time.sleep(delay)

    def fetch_posts(self, max_posts: int = 20, enable_scrolling: bool = True) -> List[Dict[str, Any]]:
        """Fetch posts from the TikTok profile using advanced stealth configuration.

        Args:
            max_posts: Maximum number of posts to fetch.
            enable_scrolling: Whether to scroll the profile page to load more videos.
        """
        posts_data = []
        try:
            self.logger.info(f"Fetching TikTok posts from @{self.target_username}")

            # Advanced stealth configuration for TikTok
            self.logger.info(f"Loading {self.base_url} with advanced stealth settings...")
            response = StealthyFetcher.fetch(
                url=self.base_url,
                # Display and stealth settings
                headless=False,           # Visible browser for manual CAPTCHA intervention
                # Network and resource management
                block_webrtc=True,        # Prevent WebRTC IP leaks
                allow_webgl=True,         # CRITICAL: required by modern anti-bot checks
                block_images=False,       # Keep images for CAPTCHA visibility
                disable_ads=True,         # Block ads for a cleaner experience
                disable_resources=False,  # Keep all resources to avoid detection
                # Geographic and fingerprinting
                geoip=True,               # Automatic geolocation spoofing
                os_randomize=True,        # Randomize OS fingerprints
                google_search=True,       # Set Google as the referrer
                # Humanization and behavior
                humanize=True,            # Enable human-like mouse movements
                # Performance and timing
                network_idle=True,        # Wait for network-idle state
                timeout=120000,           # 2-minute timeout (reduced for testing)
                wait=3000,                # 3-second wait after page load
                # Enhanced headers for better compatibility
                extra_headers={
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9,en-CA;q=0.8",
                    "Accept-Encoding": "gzip, deflate, br",
                    "Cache-Control": "max-age=0",
                    "DNT": "1",
                    "Upgrade-Insecure-Requests": "1",
                    "Sec-Fetch-Dest": "document",
                    "Sec-Fetch-Mode": "navigate",
                    "Sec-Fetch-Site": "none",
                    "Sec-Fetch-User": "?1"
                }
            )
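
            # NOTE (assumption): the keyword arguments above track a specific
            # Scrapling release; a flag unsupported by the installed version
            # may raise a TypeError, so pin the scrapling dependency
            # alongside this scraper.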

            if not response:
                self.logger.error("Failed to load TikTok profile")
                return posts_data

            self.logger.info("Page loaded successfully, performing human simulation...")

            # Perform advanced human simulation if we have access to the page object
            try:
                # Note: this would need to be adapted to Scrapling's page API
                # self._advanced_human_simulation(page)
                pass
            except Exception as e:
                self.logger.debug(f"Human simulation not available: {e}")

            # Wait for a human-like delay
            self._human_delay(3, 6)

            # Optional: scroll to load more videos
            if enable_scrolling and max_posts > 20:
                self.logger.info(f"Scrolling to load more videos (targeting {max_posts} posts)...")

                # Simulate scrolling to trigger lazy loading
                for scroll_attempt in range(min(5, max_posts // 10)):
                    try:
                        # Scroll down progressively
                        self.logger.debug(f"Scroll attempt {scroll_attempt + 1}")
                        # Note: actual scrolling would need adaptation to Scrapling's API
                        self._human_delay(2, 4)
                    except Exception as e:
                        self.logger.debug(f"Scrolling error (non-critical): {e}")
                        break

            # Extract video items using multiple strategies
            video_items = []

            # Strategy 1: primary TikTok selectors
            video_items = response.css("[data-e2e='user-post-item']")
            self.logger.info(f"Strategy 1 found {len(video_items)} items with user-post-item selector")

            # Strategies 2-3: alternative selectors
            if not video_items:
                video_items = response.css("div[class*='DivItemContainer']")
                self.logger.info(f"Strategy 2 found {len(video_items)} items with DivItemContainer selector")

            if not video_items:
                video_items = response.css("div[class*='video-feed-item']")
                self.logger.info(f"Strategy 3 found {len(video_items)} items with video-feed-item selector")

            # Strategy 4: look for video links directly
            if not video_items:
                video_links = response.css("a[href*='/video/']")
                self.logger.info(f"Strategy 4 found {len(video_links)} direct video links")

                for idx, link in enumerate(video_links[:max_posts]):
                    try:
                        # Extract href from the matched link itself using the
                        # ::attr() pseudo-selector
                        href = ""
                        href_elements = link.css("::attr(href)")
                        if href_elements:
                            href = href_elements[0]

                        if not href:
                            continue
                        if not href.startswith('http'):
                            href = f"https://www.tiktok.com{href}"

                        video_id_match = re.search(r'/video/(\d+)', href)
                        video_id = video_id_match.group(1) if video_id_match else f"video_{idx}"

                        post_data = {
                            'id': video_id,
                            'type': 'video',
                            'caption': '',
                            'author': self.target_username,
                            'publish_date': datetime.now(self.tz).isoformat(),
                            'link': href,
                            'views': 0,
                            'platform': 'tiktok'
                        }
                        posts_data.append(post_data)
                    except Exception as e:
                        self.logger.error(f"Error processing video link {idx}: {e}")
                        continue

            # Process structured video items found by strategies 1-3
            if video_items and not posts_data:
                self.logger.info(f"Processing {len(video_items)} structured video items...")

                for idx, item in enumerate(video_items[:max_posts]):
                    try:
                        # Extract the video URL using the ::attr() selector
                        video_url = ""
                        href_elements = item.css("a[href*='/video/']::attr(href)")
                        if href_elements:
                            video_url = href_elements[0]

                        if not video_url:
                            # Try an alternative approach
                            link_elements = item.css("a")
                            for link_elem in link_elements:
                                href_attrs = link_elem.css("::attr(href)")
                                if href_attrs and '/video/' in str(href_attrs[0]):
                                    video_url = href_attrs[0]
                                    break

                        if not video_url:
                            continue
                        if not video_url.startswith('http'):
                            video_url = f"https://www.tiktok.com{video_url}"

                        # Extract the video ID from the URL
                        video_id_match = re.search(r'/video/(\d+)', video_url)
                        video_id = video_id_match.group(1) if video_id_match else f"video_{idx}"

                        # Extract caption/description using the ::text selector
                        caption = ""
                        caption_elements = item.css("div[data-e2e='browse-video-desc'] span::text")
                        if caption_elements:
                            caption = caption_elements[0] if isinstance(caption_elements, list) else str(caption_elements)

                        if not caption:
                            caption_elements = item.css("div[class*='DivContainer'] span::text")
                            if caption_elements:
                                caption = caption_elements[0] if isinstance(caption_elements, list) else str(caption_elements)

                        # Extract the view count using the ::text selector
                        views_text = "0"
                        views_elements = item.css("strong[data-e2e='video-views']::text")
                        if views_elements:
                            views_text = views_elements[0] if isinstance(views_elements, list) else str(views_elements)

                        if not views_text or views_text == "0":
                            views_elements = item.css("strong::text")
                            if views_elements:
                                views_text = views_elements[0] if isinstance(views_elements, list) else str(views_elements)

                        views = self._parse_count(views_text)

                        post_data = {
                            'id': video_id,
                            'type': 'video',
                            'caption': caption,
                            'author': self.target_username,
                            'publish_date': datetime.now(self.tz).isoformat(),
                            'link': video_url,
                            'views': views,
                            'platform': 'tiktok'
                        }
                        posts_data.append(post_data)

                        if idx % 5 == 0 and idx > 0:
                            self.logger.info(f"Processed {idx} videos...")
                    except Exception as e:
                        self.logger.error(f"Error processing video item {idx}: {e}")
                        continue

            # Final fallback: extract video IDs from embedded page scripts
            if not posts_data:
                self.logger.info("No posts found via selectors, checking page scripts...")
                scripts = response.css("script")

                for script in scripts:
                    script_text_elements = script.css("::text")
                    if not script_text_elements:
                        continue
                    script_text = script_text_elements[0] if isinstance(script_text_elements, list) else str(script_text_elements)

                    if '__UNIVERSAL_DATA_FOR_REHYDRATION__' in script_text or 'window.__INIT_PROPS__' in script_text:
                        try:
                            # Look for video IDs in the script content
                            urls = re.findall(r'/video/(\d+)', script_text)
                            unique_ids = list(set(urls))  # Remove duplicates
                            self.logger.info(f"Found {len(unique_ids)} unique video IDs in script data")

                            for video_id in unique_ids[:max_posts]:
                                post_data = {
                                    'id': video_id,
                                    'type': 'video',
                                    'caption': '',
                                    'author': self.target_username,
                                    'publish_date': datetime.now(self.tz).isoformat(),
                                    'link': f"https://www.tiktok.com/@{self.target_username}/video/{video_id}",
                                    'views': 0,
                                    'platform': 'tiktok'
                                }
                                posts_data.append(post_data)
                        except Exception as e:
                            self.logger.debug(f"Could not parse script data: {e}")
                            continue

            self.logger.info(f"Successfully fetched {len(posts_data)} TikTok posts")
        except Exception as e:
            self.logger.error(f"Error fetching TikTok posts: {e}")
            import traceback
            self.logger.error(traceback.format_exc())

        return posts_data

    def _fetch_video_details(self, video_url: str) -> Optional[Dict[str, Any]]:
        """Fetch detailed information from an individual TikTok video page.

        Args:
            video_url: URL of the TikTok video.

        Returns:
            Dictionary with caption and additional metadata, or None on failure.
        """
        try:
            self.logger.debug(f"Fetching details for: {video_url}")

            # Fetch the individual video page with stealth settings
            video_response = StealthyFetcher.fetch(
                url=video_url,
                headless=False,
                block_webrtc=True,
                allow_webgl=True,
                block_images=False,
                disable_ads=True,
                geoip=True,
                os_randomize=True,
                google_search=True,
                humanize=True,
                network_idle=True,
                timeout=60000,  # 1-minute timeout for individual pages
                wait=2000,
                extra_headers={
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Language": "en-US,en;q=0.9",
                    "Accept-Encoding": "gzip, deflate, br",
                    "DNT": "1",
                    "Upgrade-Insecure-Requests": "1"
                }
            )

            if not video_response:
                self.logger.warning(f"Failed to load video page: {video_url}")
                return None

            details = {}

            # Extract caption/description from the video page
            caption_selectors = [
                "h1[data-e2e='browse-video-desc']",
                "div[data-e2e='browse-video-desc']",
                "span[data-e2e='browse-video-desc']",
                "div.video-meta-caption",
                "div[class*='DivVideoInfoContainer'] span",
                "h1.video-meta-title",
                "meta[property='og:description']::attr(content)"
            ]

            caption = ""
            for selector in caption_selectors:
                try:
                    # Don't append ::text to selectors that already target an attribute
                    query = selector if '::attr' in selector else f"{selector}::text"
                    caption_elements = video_response.css(query)
                    if caption_elements:
                        caption = ' '.join(str(elem).strip() for elem in caption_elements if elem)
                        if caption:
                            self.logger.debug(f"Found caption with selector: {selector}")
                            break
                except Exception:
                    continue

            details['caption'] = caption

            # Try to extract additional metadata
            # Likes
            likes_elements = video_response.css("strong[data-e2e='like-count']::text")
            if likes_elements:
                details['likes'] = self._parse_count(str(likes_elements[0]))

            # Comments
            comments_elements = video_response.css("strong[data-e2e='comment-count']::text")
            if comments_elements:
                details['comments'] = self._parse_count(str(comments_elements[0]))

            # Shares
            shares_elements = video_response.css("strong[data-e2e='share-count']::text")
            if shares_elements:
                details['shares'] = self._parse_count(str(shares_elements[0]))

            # Duration
            duration_elements = video_response.css("div[class*='DivSeekBarTimeContainer'] div::text")
            if duration_elements and len(duration_elements) >= 2:
                details['duration'] = str(duration_elements[1])

            return details
        except Exception as e:
            self.logger.error(f"Error fetching video details from {video_url}: {e}")
            return None

    def _parse_count(self, count_str: str) -> int:
        """Parse TikTok view/like counts (e.g., '1.2M' -> 1200000)."""
        if not count_str:
            return 0

        count_str = str(count_str).strip().upper()
        try:
            if 'K' in count_str:
                num = re.search(r'([\d.]+)', count_str)
                if num:
                    return int(float(num.group(1)) * 1000)
            elif 'M' in count_str:
                num = re.search(r'([\d.]+)', count_str)
                if num:
                    return int(float(num.group(1)) * 1000000)
            elif 'B' in count_str:
                num = re.search(r'([\d.]+)', count_str)
                if num:
                    return int(float(num.group(1)) * 1000000000)
            else:
                # Remove any non-numeric characters
                return int(re.sub(r'[^\d]', '', count_str) or 0)
        except Exception:
            return 0
        # Fall-through for strings with a suffix but no digits (e.g., 'K')
        return 0
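
    # Illustrative _parse_count results, derived from the logic above
    # (not executed):
    #   _parse_count('1.2M') -> 1200000
    #   _parse_count('4.5K') -> 4500
    #   _parse_count('2.1B') -> 2100000000
    #   _parse_count('987')  -> 987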

    def fetch_content(self, max_posts: int = 20, fetch_captions: bool = False,
                      max_caption_fetches: int = 10) -> List[Dict[str, Any]]:
        """Fetch all content from TikTok, with optional caption retrieval.

        Args:
            max_posts: Maximum number of posts to fetch.
            fetch_captions: Whether to fetch captions from individual video pages.
            max_caption_fetches: Maximum number of videos to fetch captions for.
        """
        # First, get video IDs and basic info from the profile
        posts_data = self.fetch_posts(max_posts=max_posts, enable_scrolling=(max_posts > 20))

        # Optionally fetch captions from individual video pages
        if fetch_captions and posts_data:
            caption_limit = min(len(posts_data), max_caption_fetches)
            self.logger.info(f"Fetching captions for {caption_limit} videos (this will take time)...")

            successful_fetches = 0
            for i, post in enumerate(posts_data[:caption_limit]):
                try:
                    # Aggressive delay before each fetch to avoid detection
                    self._human_delay(5, 10)

                    # Fetch individual video details
                    video_url = post.get('link', '')
                    if not video_url:
                        continue

                    self.logger.info(f"Fetching caption {i+1}/{caption_limit}: {video_url}")
                    video_details = self._fetch_video_details(video_url)

                    if video_details:
                        # Update the post with the fetched details
                        post.update(video_details)
                        successful_fetches += 1
                        self.logger.info(f"Successfully fetched caption ({successful_fetches}/{caption_limit})")

                    # Extended break every 3 videos to avoid detection
                    if (i + 1) % 3 == 0 and i < caption_limit - 1:
                        break_time = random.uniform(30, 60)
                        self.logger.info(f"Taking extended {break_time:.0f}s break to avoid detection...")
                        time.sleep(break_time)
                except Exception as e:
                    self.logger.warning(f"Failed to fetch details for video {i+1}: {e}")
                    continue

            self.logger.info(f"Caption fetching complete: {successful_fetches}/{caption_limit} successful")

        return posts_data

    def format_markdown(self, items: List[Dict[str, Any]]) -> str:
        """Format TikTok content as markdown."""
        markdown_sections = []

        for item in items:
            section = []

            # ID
            section.append(f"# ID: {item.get('id', 'N/A')}")
            section.append("")

            # Type
            section.append(f"## Type: {item.get('type', 'video')}")
            section.append("")

            # Author
            section.append(f"## Author: @{item.get('author', 'Unknown')}")
            section.append("")

            # Publish Date
            section.append(f"## Publish Date: {item.get('publish_date', '')}")
            section.append("")

            # Link
            section.append(f"## Link: {item.get('link', '')}")
            section.append("")

            # Views
            views = item.get('views', 0)
            section.append(f"## Views: {views:,}")
            section.append("")

            # Likes (if fetched from the individual page)
            likes = item.get('likes')
            if likes is not None:
                section.append(f"## Likes: {likes:,}")
                section.append("")

            # Comments (if fetched from the individual page)
            comments = item.get('comments')
            if comments is not None:
                section.append(f"## Comments: {comments:,}")
                section.append("")

            # Shares (if fetched from the individual page)
            shares = item.get('shares')
            if shares is not None:
                section.append(f"## Shares: {shares:,}")
                section.append("")

            # Duration (if fetched from the individual page)
            duration = item.get('duration')
            if duration:
                section.append(f"## Duration: {duration}")
                section.append("")

            # Caption
            section.append("## Caption:")
            caption = item.get('caption', '')
            if caption:
                section.append(caption)
            else:
                section.append("(No caption available - fetch individual video for details)")
            section.append("")

            # Separator
            section.append("-" * 50)
            section.append("")

            markdown_sections.append('\n'.join(section))

        return '\n'.join(markdown_sections)

    def get_incremental_items(self, items: List[Dict[str, Any]], state: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Return only the videos that are new since the last sync.

        Assumes `items` is ordered newest-first, matching the order in which
        the profile grid is scraped.
        """
        if not state:
            return items

        last_video_id = state.get('last_video_id')
        if not last_video_id:
            return items

        # Collect videos until we reach the last synced one
        new_items = []
        for item in items:
            if item.get('id') == last_video_id:
                break  # Found the last synced video
            new_items.append(item)

        return new_items

    def update_state(self, state: Dict[str, Any], items: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Update state with the latest video information."""
        if not items:
            return state

        # The first item is the most recent (items are ordered newest-first)
        latest_item = items[0]
        state['last_video_id'] = latest_item.get('id')
        state['last_video_date'] = latest_item.get('publish_date')
        state['last_sync'] = datetime.now(self.tz).isoformat()
        state['video_count'] = len(items)

        return state
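

# Minimal usage sketch (illustrative only). The commit above confirms that
# ScraperConfig takes a brand_name parameter; any other required fields live
# in src/base_scraper.py and would need to be supplied here as well.
if __name__ == "__main__":
    config = ScraperConfig(brand_name='hkia')  # assumption: brand_name alone suffices
    scraper = TikTokScraperAdvanced(config)

    # Captions are skipped by default because each one costs a full stealth
    # page load plus 5-10 s of human-like delay.
    posts = scraper.fetch_content(max_posts=20, fetch_captions=False)
    print(scraper.format_markdown(posts))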