#!/usr/bin/env python3
"""
TikTok scraper using TikTokApi library with Playwright.
"""

import os
import time
import random
import asyncio
import concurrent.futures
from typing import Any, Dict, List, Optional
from datetime import datetime
from pathlib import Path

from TikTokApi import TikTokApi

from src.base_scraper import BaseScraper, ScraperConfig


class TikTokScraper(BaseScraper):
    """TikTok scraper using TikTokApi with Playwright.

    Fetches the most recent videos of one target account, converts them to
    plain dictionaries, renders them as markdown and tracks the last synced
    video id for incremental runs.
    """

    # Seconds to pause once ``max_requests_per_hour`` requests were counted.
    RATE_LIMIT_PAUSE_SECONDS = 3600
    # An extended break is taken every this many counted requests.
    EXTENDED_BREAK_INTERVAL = 10
    # Descriptions longer than this are truncated in the markdown output.
    MAX_DESCRIPTION_LENGTH = 500

    def __init__(self, config: ScraperConfig):
        """Read credentials/target from the environment and prepare the API.

        Args:
            config: Scraper configuration; ``config.data_dir`` is used as the
                parent of the session directory.
        """
        super().__init__(config)
        self.username = os.getenv('TIKTOK_USERNAME')
        self.password = os.getenv('TIKTOK_PASSWORD')
        self.target_account = os.getenv('TIKTOK_TARGET', 'hkia')

        # Session directory for persistence
        self.session_dir = self.config.data_dir / '.sessions' / 'tiktok'
        self.session_dir.mkdir(parents=True, exist_ok=True)

        # Setup API
        self.api = self._setup_api()

        # Request counter for rate limiting
        self.request_count = 0
        self.max_requests_per_hour = 100

    def _setup_api(self) -> TikTokApi:
        """Setup TikTokApi with conservative settings."""
        # Note: In production, you'd get ms_token from browser cookies.
        # For now, we'll let the API try to get it automatically.
        # TikTokApi v7 has simplified parameters.
        return TikTokApi()

    # ------------------------------------------------------------------
    # Pacing / rate limiting
    # ------------------------------------------------------------------

    def _pick_delay(self, min_seconds: float, max_seconds: float) -> float:
        """Draw a random delay in ``[min_seconds, max_seconds]`` and log it."""
        delay = random.uniform(min_seconds, max_seconds)
        self.logger.debug(f"Waiting {delay:.2f} seconds...")
        return delay

    def _humanized_delay(self, min_seconds: float = 3, max_seconds: float = 7) -> None:
        """Add humanized random delay between requests (blocking).

        Blocks the calling thread; inside a coroutine use
        ``_humanized_delay_async`` instead.
        """
        time.sleep(self._pick_delay(min_seconds, max_seconds))

    async def _humanized_delay_async(self, min_seconds: float = 3, max_seconds: float = 7) -> None:
        """Awaitable variant of ``_humanized_delay``.

        Uses ``asyncio.sleep`` so the event loop keeps running while waiting.
        """
        await asyncio.sleep(self._pick_delay(min_seconds, max_seconds))

    def _check_rate_limit(self) -> None:
        """Count one request and enforce rate limiting (blocking).

        When the counter reaches ``max_requests_per_hour`` the thread sleeps
        for ``RATE_LIMIT_PAUSE_SECONDS`` and the counter is reset; otherwise an
        extended random break is taken every ``EXTENDED_BREAK_INTERVAL``
        requests.
        """
        self.request_count += 1

        if self.request_count >= self.max_requests_per_hour:
            self.logger.warning(
                f"Rate limit reached ({self.max_requests_per_hour} requests), pausing for 1 hour..."
            )
            time.sleep(self.RATE_LIMIT_PAUSE_SECONDS)  # Wait 1 hour
            self.request_count = 0
        elif self.request_count % self.EXTENDED_BREAK_INTERVAL == 0:
            # Take a longer break every 10 requests
            self.logger.info("Taking extended break after 10 requests...")
            self._humanized_delay(15, 30)

    async def _check_rate_limit_async(self) -> None:
        """Awaitable variant of ``_check_rate_limit``.

        Applies the same counting rules but waits with ``asyncio.sleep`` so a
        long pause does not freeze the event loop the coroutine runs on.
        """
        self.request_count += 1

        if self.request_count >= self.max_requests_per_hour:
            self.logger.warning(
                f"Rate limit reached ({self.max_requests_per_hour} requests), pausing for 1 hour..."
            )
            await asyncio.sleep(self.RATE_LIMIT_PAUSE_SECONDS)  # Wait 1 hour
            self.request_count = 0
        elif self.request_count % self.EXTENDED_BREAK_INTERVAL == 0:
            # Take a longer break every 10 requests
            self.logger.info("Taking extended break after 10 requests...")
            await self._humanized_delay_async(15, 30)

    # ------------------------------------------------------------------
    # Extraction helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _lookup(source: Any, *names: str, default: Any = None) -> Any:
        """Return the first non-``None`` value found under ``names``.

        ``source`` may be a dict (keys are tried) or any other object
        (attributes are tried). ``default`` is returned when nothing matches,
        including when ``source`` itself is ``None``.
        """
        for name in names:
            if isinstance(source, dict):
                value = source.get(name)
            else:
                value = getattr(source, name, None)
            if value is not None:
                return value
        return default

    @staticmethod
    def _format_publish_date(value: Any) -> str:
        """Render a creation time as an ISO-8601 string, or ``''`` if unusable.

        Accepts a ``datetime`` (formatted as is) or a Unix timestamp given as
        a number or numeric string (interpreted in local time, like the
        original ``datetime.fromtimestamp`` call).
        """
        if value is None or value == '':
            return ''
        if isinstance(value, datetime):
            return value.isoformat()
        try:
            return datetime.fromtimestamp(float(value)).isoformat()
        except (TypeError, ValueError, OverflowError, OSError):
            return ''

    def _extract_video_data(self, video: Any) -> Dict[str, Any]:
        """Convert one API video object into a plain dictionary.

        ``video.id`` and ``video.author.username`` are required; if either is
        missing the resulting ``AttributeError`` propagates to the caller.
        Every other field is optional and falls back to an empty/zero value
        instead of discarding the whole video.

        NOTE(review): field names are tried in both snake_case attribute form
        and the camelCase form of the raw payload (``video.as_dict``), because
        the shape appears to differ between TikTokApi versions - confirm
        against the pinned version.
        """
        raw = getattr(video, 'as_dict', None)
        if not isinstance(raw, dict):
            raw = {}

        author = video.author
        username = author.username

        nickname = self._lookup(author, 'nickname')
        if nickname is None:
            nickname = self._lookup(getattr(author, 'as_dict', None), 'nickname')
        if nickname is None:
            nickname = self._lookup(raw.get('author'), 'nickname', default='')

        description = self._lookup(video, 'desc')
        if description is None:
            description = self._lookup(raw, 'desc', default='')

        created = self._lookup(video, 'create_time')
        if created is None:
            created = self._lookup(raw, 'createTime')

        stats = self._lookup(video, 'stats')
        if stats is None:
            stats = self._lookup(raw, 'stats', default={})

        duration = self._lookup(video, 'duration')
        if duration is None:
            duration = self._lookup(raw.get('video'), 'duration', default=0)

        sound = self._lookup(video, 'music', 'sound')
        if sound is None:
            sound = raw.get('music')

        return {
            'id': video.id,
            'author': username,
            'nickname': nickname,
            'description': description,
            'publish_date': self._format_publish_date(created),
            'link': f'https://www.tiktok.com/@{username}/video/{video.id}',
            'views': self._lookup(stats, 'play_count', 'playCount', default=0),
            # "digg" is TikTok's name for a like; "collect" counts saves and
            # must not be reported as likes.
            'likes': self._lookup(stats, 'digg_count', 'diggCount', default=0),
            'comments': self._lookup(stats, 'comment_count', 'commentCount', default=0),
            'shares': self._lookup(stats, 'share_count', 'shareCount', default=0),
            'duration': duration,
            'music': self._lookup(sound, 'title', default=''),
            'hashtags': self._lookup(video, 'hashtags', default=[]),
        }

    @staticmethod
    def _hashtag_names(hashtags: Any) -> List[str]:
        """Collect display names from a list of hashtags.

        Each entry may be a string, a dict with a ``name`` key, or an object
        with a ``name`` attribute; entries without a usable name are skipped.
        """
        names = []
        for tag in hashtags:
            if isinstance(tag, str):
                name = tag
            elif isinstance(tag, dict):
                name = tag.get('name')
            else:
                name = getattr(tag, 'name', None)
            if name:
                names.append(str(name))
        return names

    # ------------------------------------------------------------------
    # Fetching
    # ------------------------------------------------------------------

    async def fetch_user_videos(self, max_videos: int = 20) -> List[Dict[str, Any]]:
        """Fetch videos from TikTok user profile.

        Args:
            max_videos: Upper bound on the number of videos to return. Values
                of zero or less return an empty list without opening a
                browser session.

        Returns:
            A list of video dictionaries (see ``_extract_video_data``). Errors
            are logged rather than raised, so the list holds whatever was
            collected before a failure and may be empty.
        """
        videos_data: List[Dict[str, Any]] = []
        if max_videos <= 0:
            return videos_data

        try:
            self.logger.info(f"Fetching videos from @{self.target_account}")

            # Create sessions with Playwright
            async with self.api:
                # Try to get ms_token from environment or let API handle it
                ms_token = os.getenv('TIKTOK_MS_TOKEN')
                ms_tokens = [ms_token] if ms_token else []

                await self.api.create_sessions(
                    ms_tokens=ms_tokens,
                    num_sessions=1,
                    sleep_after=3,
                    headless=True,
                    suppress_resource_load_types=["image", "media", "font", "stylesheet"]
                )

                # Get user object
                user = self.api.user(self.target_account)
                await self._check_rate_limit_async()

                # Get videos
                async for video in user.videos(count=max_videos):
                    if len(videos_data) >= max_videos:
                        break

                    try:
                        videos_data.append(self._extract_video_data(video))
                    except Exception as e:
                        # One malformed video must not abort the whole run.
                        self.logger.error(f"Error processing video: {e}")
                        continue

                    count = len(videos_data)

                    # Rate limiting: awaited, so the event loop that drives
                    # the browser session is not blocked while we wait.
                    await self._humanized_delay_async()
                    await self._check_rate_limit_async()

                    # Log progress
                    if count % 5 == 0:
                        self.logger.info(f"Fetched {count}/{max_videos} videos")

            self.logger.info(f"Successfully fetched {len(videos_data)} videos")

        except Exception as e:
            self.logger.error(f"Error fetching videos: {e}")

        return videos_data

    def fetch_content(self) -> List[Dict[str, Any]]:
        """Synchronous wrapper for fetch_user_videos.

        When called from a thread with no running event loop the coroutine is
        run directly with ``asyncio.run``. When a loop is already running in
        this thread, the coroutine is run on a fresh loop in a worker thread
        and this call blocks until it finishes.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No running loop in this thread: safe to start one here.
            return asyncio.run(self.fetch_user_videos())

        # A loop is already running here and cannot be re-entered, so run the
        # coroutine on its own loop in a separate thread.
        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
            future = executor.submit(lambda: asyncio.run(self.fetch_user_videos()))
            return future.result()

    # ------------------------------------------------------------------
    # Formatting
    # ------------------------------------------------------------------

    def format_markdown(self, videos: List[Dict[str, Any]]) -> str:
        """Format TikTok videos as markdown.

        Each video becomes a block of ``## Field: value`` headings, each
        followed by a blank line, ending with a 50-dash separator. Nickname,
        music and hashtags are emitted only when present; descriptions are
        cut to ``MAX_DESCRIPTION_LENGTH`` characters plus ``...``.
        """
        markdown_sections = []

        for video in videos:
            section: List[str] = []

            def add(line: str) -> None:
                # Every heading is followed by one blank line.
                section.append(line)
                section.append("")

            add(f"# ID: {video.get('id', 'N/A')}")
            add(f"## Author: {video.get('author', 'Unknown')}")

            nickname = video.get('nickname', '')
            if nickname:
                add(f"## Nickname: {nickname}")

            add(f"## Publish Date: {video.get('publish_date', '')}")
            add(f"## Link: {video.get('link', '')}")
            add(f"## Views: {video.get('views', 0)}")
            add(f"## Likes: {video.get('likes', 0)}")
            add(f"## Comments: {video.get('comments', 0)}")
            add(f"## Shares: {video.get('shares', 0)}")
            add(f"## Duration: {video.get('duration', 0)} seconds")

            music = video.get('music', '')
            if music:
                add(f"## Music: {music}")

            hashtags = video.get('hashtags', [])
            if hashtags:
                hashtags_str = ', '.join(self._hashtag_names(hashtags))
                add(f"## Hashtags: {hashtags_str}")

            # Description
            section.append("## Description:")
            description = video.get('description', '')
            if description:
                # Limit description to first 500 characters
                if len(description) > self.MAX_DESCRIPTION_LENGTH:
                    description = description[:self.MAX_DESCRIPTION_LENGTH] + "..."
                section.append(description)
            section.append("")

            # Separator
            add("-" * 50)

            markdown_sections.append('\n'.join(section))

        return '\n'.join(markdown_sections)

    # ------------------------------------------------------------------
    # Incremental sync state
    # ------------------------------------------------------------------

    def get_incremental_items(self, items: List[Dict[str, Any]], state: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Get only new videos since last sync.

        Returns the items that precede the one whose ``id`` equals
        ``state['last_video_id']``. All items are returned when there is no
        state, no stored id, or the stored id does not occur in ``items``.
        """
        last_video_id = state.get('last_video_id') if state else None
        if not last_video_id:
            return items

        new_items = []
        for item in items:
            if item.get('id') == last_video_id:
                break  # Found the last synced video
            new_items.append(item)

        return new_items

    def update_state(self, state: Dict[str, Any], items: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Update state with latest video information.

        Args:
            state: Previous sync state; it is updated in place. ``None`` is
                treated as an empty state.
            items: Videos from the current run; the first one is recorded as
                the latest.

        Returns:
            The updated state, or ``state`` unchanged when ``items`` is empty.
        """
        if not items:
            return state

        if state is None:
            state = {}

        # Get the first item (most recent)
        latest_item = items[0]

        state['last_video_id'] = latest_item.get('id')
        state['last_video_date'] = latest_item.get('publish_date')
        state['last_sync'] = datetime.now(self.tz).isoformat()
        state['video_count'] = len(items)

        return state