import json
import pandas as pd
from datetime import datetime
import time
import argparse

# Reference point for converting naive datetimes to Unix seconds.
_UNIX_EPOCH = datetime(1970, 1, 1)


def _yelp_date_to_timestamp(date_str):
    """Convert a review date string to integer Unix seconds.

    Accepts 'YYYY-MM-DD HH:MM:SS' and falls back to 'YYYY-MM-DD'.

    The string carries no timezone, so it is interpreted as UTC. This keeps
    the result identical on every machine and strictly order-preserving;
    ``time.mktime`` would instead apply the local timezone and DST rules of
    whichever host runs the conversion.

    Raises:
        ValueError: if ``date_str`` matches neither supported format.
    """
    try:
        parsed = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        parsed = datetime.strptime(date_str, '%Y-%m-%d')
    return int((parsed - _UNIX_EPOCH).total_seconds())


def convert_yelp_to_movielens_format(review_json_path, output_csv_path):
    """Convert a Yelp review file (one JSON object per line) to a MovieLens-style CSV.

    Each review's ``user_id`` and ``business_id`` strings are mapped to
    consecutive integers starting at 1, in order of first appearance. The
    output has the columns ``userId, movieId, rating, timestamp`` and is
    sorted by ``userId`` and then ``timestamp``. Summary counts are printed
    to stdout.

    Args:
        review_json_path: Path to the Yelp review file (JSON Lines, UTF-8).
        output_csv_path: Path of the CSV file to write.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a review lacks ``user_id``, ``business_id``, ``stars``
            or ``date``.
        ValueError: if a review's date is in neither supported format.
    """
    columns = ['userId', 'movieId', 'rating', 'timestamp']
    rows = []
    user_id_to_int = {}
    business_id_to_int = {}

    with open(review_json_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            review = json.loads(line)

            user_id = review['user_id']
            business_id = review['business_id']
            rating = review['stars']
            date_str = review['date']

            # setdefault assigns the next integer id only on first sight.
            user_int = user_id_to_int.setdefault(
                user_id, len(user_id_to_int) + 1
            )
            business_int = business_id_to_int.setdefault(
                business_id, len(business_id_to_int) + 1
            )

            rows.append((
                user_int,
                business_int,
                float(rating),
                _yelp_date_to_timestamp(date_str),
            ))

    # Explicit columns keep the sort (and the CSV header) valid even when the
    # input contains no reviews at all.
    df = pd.DataFrame(rows, columns=columns)
    df = df.sort_values(by=['userId', 'timestamp'])
    df.to_csv(output_csv_path, index=False)
    print(f"已将 Yelp 数据转换为 MovieLens 格式，保存到 {output_csv_path}")
    print(f"总用户数: {len(user_id_to_int)}")
    print(f"总商家数: {len(business_id_to_int)}")
    print(f"总评论数: {len(df)}")

if __name__ == "__main__":
    # Command-line entry point: both paths are optional and fall back to the
    # defaults declared below.
    cli = argparse.ArgumentParser(
        description="Convert Yelp review.json to MovieLens format CSV."
    )
    cli.add_argument(
        '--input',
        type=str,
        default='data/yelp_academic_dataset_review.json',
        help='Path to Yelp review.json file',
    )
    cli.add_argument(
        '--output',
        type=str,
        default='data/yelp.csv',
        help='Path to output CSV file',
    )
    options = cli.parse_args()

    convert_yelp_to_movielens_format(options.input, options.output)