import requests
import json
from datetime import datetime, timedelta
import time
from apikey import API_KEY

def get_yesterday_articles(timeout=30):
    """
    Fetch all Guardian articles published yesterday.

    "Yesterday" is derived from the local system clock. The API is queried
    page by page (50 results per request) until the page the API reports as
    current reaches the page count it reports. A request failure, a non-"ok"
    API status, or an unexpected response shape stops paging; whatever was
    collected up to that point is still returned.

    Args:
        timeout: Seconds to wait for the server on each HTTP request before
            giving up. Passed straight to ``requests.get``.

    Returns:
        A list of the raw article dicts taken from each page's
        ``response.results``, in the order the API returned them. The list
        is empty if the first request fails.
    """
    # Calculate yesterday's date
    yesterday = datetime.now() - timedelta(days=1)
    yesterday_str = yesterday.strftime('%Y-%m-%d')

    print(f"Fetching Guardian articles from {yesterday_str}")

    # API configuration
    base_url = "XXXX"

    # Parameters for the API request
    params = {
        'api-key': API_KEY,
        'from-date': yesterday_str,
        'to-date': yesterday_str,
        'page-size': 50,  # Maximum allowed
        'show-fields': 'headline,byline,trailText,body,wordcount,thumbnail,shortUrl,publication',
        'show-tags': 'contributor,keyword,tone,type',
        'show-elements': 'image',
        'order-by': 'newest'
    }

    all_articles = []
    page = 1

    while True:
        params['page'] = page

        try:
            print(f"Fetching page {page}...")
            # Without a timeout a stalled connection would block forever;
            # a timeout surfaces as a RequestException handled below.
            response = requests.get(base_url, params=params, timeout=timeout)
            response.raise_for_status()  # Raise an exception for bad status codes

            data = response.json()

            if data['response']['status'] != 'ok':
                print(f"API returned error status: {data['response']['status']}")
                break

            # Get pagination info
            total_results = data['response']['total']
            total_pages = data['response']['pages']
            current_page = data['response']['currentPage']

            print(f"Page {current_page} of {total_pages} (Total articles: {total_results})")

            # Add articles from current page
            all_articles.extend(data['response']['results'])

            # Check if we've reached the last page
            if current_page >= total_pages:
                break

            page += 1

            # Add a small delay to be respectful to the API
            time.sleep(0.1)

        except requests.exceptions.RequestException as e:
            print(f"Error fetching page {page}: {e}")
            break
        except (KeyError, TypeError, ValueError) as e:
            # KeyError: an expected field is missing.
            # TypeError: the payload is not the expected nested mapping.
            # ValueError: the body was not valid JSON.
            # Stop paging but keep the pages already collected.
            print(f"Unexpected response format: {e}")
            break

    return all_articles

def _process_article(article):
    """Return the trimmed-down record that is stored for one raw article dict."""
    processed_article = {
        'id': article.get('id'),
        'type': article.get('type'),
        'section': {
            'id': article.get('sectionId'),
            'name': article.get('sectionName')
        },
        'publication_date': article.get('webPublicationDate'),
        'title': article.get('webTitle'),
        'url': article.get('webUrl'),
        'api_url': article.get('apiUrl'),
        'pillar': {
            'id': article.get('pillarId'),
            'name': article.get('pillarName')
        }
    }

    # Optional parts are only emitted when the raw article carries them, so
    # the saved record distinguishes "absent" from "present but empty".
    if 'fields' in article:
        fields = article['fields']
        processed_article['fields'] = {
            'headline': fields.get('headline'),
            'byline': fields.get('byline'),
            'trail_text': fields.get('trailText'),
            'body': fields.get('body'),
            'word_count': fields.get('wordcount'),
            'thumbnail': fields.get('thumbnail'),
            'short_url': fields.get('shortUrl'),
            'publication': fields.get('publication')
        }

    if 'tags' in article:
        processed_article['tags'] = [
            {
                'id': tag.get('id'),
                'type': tag.get('type'),
                'web_title': tag.get('webTitle'),
                'section_id': tag.get('sectionId'),
                'section_name': tag.get('sectionName')
            }
            for tag in article['tags']
        ]

    # Elements (images) are passed through untouched.
    if 'elements' in article:
        processed_article['elements'] = article['elements']

    return processed_article


def _print_counts(heading, values):
    """Print *heading*, then each distinct value with its count, most frequent first."""
    from collections import Counter

    print(f"\n{heading}")
    # most_common() keeps first-seen order among equal counts, matching a
    # stable descending sort of the tallies.
    for value, count in Counter(values).most_common():
        print(f"  {value}: {count}")


def process_and_save_articles(articles):
    """
    Process articles, save them to a JSON file and print a summary.

    Each raw article dict is reduced to a fixed set of keys (missing keys
    become ``None``), with ``fields``, ``tags`` and ``elements`` included
    only when present on the raw article. The result is written to
    ``guardian_articles_<YYYY-MM-DD>.json`` in the current working directory,
    where the date is yesterday according to the local system clock, and
    per-section and per-type counts are printed.

    Args:
        articles: Iterable of raw article dicts. If it is empty (falsy), a
            message is printed and nothing is written.

    Returns:
        None.
    """
    if not articles:
        print("No articles found.")
        return

    processed_articles = [_process_article(article) for article in articles]

    # Read the clock once so the filename, target_date and fetch_date cannot
    # disagree if the run straddles midnight.
    now = datetime.now()
    target_date = (now - timedelta(days=1)).strftime('%Y-%m-%d')
    filename = f"guardian_articles_{target_date}.json"

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump({
            'fetch_date': now.isoformat(),
            'target_date': target_date,
            'total_articles': len(processed_articles),
            'articles': processed_articles
        }, f, indent=2, ensure_ascii=False)

    print(f"\nSaved {len(processed_articles)} articles to {filename}")

    # Print summary statistics
    print("\n=== SUMMARY ===")
    print(f"Total articles fetched: {len(processed_articles)}")

    _print_counts(
        "Articles by section:",
        (article['section']['name'] for article in processed_articles),
    )
    _print_counts(
        "Articles by type:",
        (article['type'] for article in processed_articles),
    )

def main():
    """
    Entry point: print a banner, then fetch yesterday's articles and save them.

    Any exception raised by the fetch or save step is reported on the console
    together with its traceback instead of propagating to the caller.
    """
    print("Guardian Article Fetcher")
    print("=" * 30)

    try:
        # Fetch, then hand the result straight to the processing/saving step
        process_and_save_articles(get_yesterday_articles())
    except Exception as err:
        print(f"An error occurred: {err}")
        import traceback
        traceback.print_exc()

# Run the fetcher only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


