Google Takeout allows users to download their data, including Chrome browsing history, in JSON format. This is a convenient way to keep a record of your online activity, but working with large JSON files, especially ones over 100MB, can be challenging. This article explains how to convert these large JSON files into CSV format efficiently; the recommended approach depends on the file size.
If your Google Takeout Chrome History JSON file is 300MB or smaller, our Chrome extension is the ideal tool for the job. The extension is designed specifically to manage and process large datasets using Chrome APIs and IndexedDB.
Get the Chrome extension here.
For Google Takeout Chrome History JSON files larger than 300MB, we recommend a Python script instead, which avoids in-browser performance issues and handles the data more reliably. Python is better suited to very large datasets because it manages memory more effectively and processes the data faster than an extension running inside the browser.
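For reference, the Chrome history file in a Takeout export (usually named BrowserHistory.json) wraps the visit records in a "Browser History" array, with timestamps recorded in microseconds since the Unix epoch. The snippet below is an illustrative sketch of that structure with made-up values; exact fields can vary between exports, so check your own file before running the script:

{
  "Browser History": [
    {
      "page_transition": "LINK",
      "title": "Example Domain",
      "url": "https://example.com/",
      "client_id": "abcd1234",
      "time_usec": 1700000000000000
    }
  ]
}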
Python Script for Converting JSON to CSV
Below is a Python script designed to process large Google Takeout JSON files and convert them into CSV format:
import json
import csv
from datetime import datetime
from urllib.parse import urlparse

def clean_url(url):
    return url.replace("http://", "").replace("https://", "").split("?")[0].split("#")[0]

def extract_domain(url):
    return urlparse(url).hostname

def format_local_date_time(dt):
    return dt.strftime('%Y-%m-%d %H:%M:%S')
def process_visits(visits, now):
    # Visits arrive sorted newest-first, so each map remembers the most
    # recently processed (i.e. chronologically *next*) visit time for a URL,
    # cleaned URL, or domain.
    next_visit_to_same_url_map = {}
    next_visit_to_same_url_clean_map = {}
    next_visit_to_same_domain_map = {}
    last_visit_time = now
    distinct_days = {datetime.fromtimestamp(visit["visitTime"] / 1000).date() for visit in visits}
    total_history_days = len(distinct_days)
    csv_rows = []

    for visit in visits:
        visit_date = datetime.fromtimestamp(visit["visitTime"] / 1000)
        url = visit["url"]
        domain = extract_domain(url)
        url_clean = clean_url(url)

        # Calendar features derived from the visit timestamp.
        hour = visit_date.hour
        day_of_week = visit_date.weekday() + 1  # Monday = 1 ... Sunday = 7
        is_weekend = 1 if day_of_week in [6, 7] else 0
        day_of_month = visit_date.day
        first_day = datetime(visit_date.year, visit_date.month, 1).weekday()
        week_of_month = (day_of_month + first_day) // 7 + 1
        month_of_year = visit_date.month

        # Look up when this URL/domain was next visited; default to "now"
        # when this is the most recent visit on record.
        next_visit_to_same_url = next_visit_to_same_url_map.get(url, now)
        next_visit_to_same_url_clean = next_visit_to_same_url_clean_map.get(url_clean, now)
        next_visit_to_same_domain = next_visit_to_same_domain_map.get(domain, now)

        seconds_until_next_visit_url = round((next_visit_to_same_url - visit["visitTime"]) / 1000)
        seconds_until_next_visit_url_clean = round((next_visit_to_same_url_clean - visit["visitTime"]) / 1000)
        seconds_until_next_visit_domain = round((next_visit_to_same_domain - visit["visitTime"]) / 1000)
        seconds_until_next_visit = round((last_visit_time - visit["visitTime"]) / 1000)

        last_visit_time = visit["visitTime"]
        next_visit_to_same_url_map[url] = visit["visitTime"]
        next_visit_to_same_url_clean_map[url_clean] = visit["visitTime"]
        next_visit_to_same_domain_map[domain] = visit["visitTime"]

        csv_row = {
            "url": url,
            "url_clean": url_clean,
            "url_domain": domain,
            # csv.DictWriter escapes quotes itself, so the title only needs truncating.
            "title": visit.get("title", "")[:300],
            "time": format_local_date_time(visit_date),
            "hour": hour,
            "day_of_week": day_of_week,
            "is_weekend": is_weekend,
            "day_of_month": day_of_month,
            "week_of_month": week_of_month,
            "month_of_year": month_of_year,
            "total_history_days": total_history_days,
            "seconds_until_next_visit_url": seconds_until_next_visit_url,
            "seconds_until_next_visit_url_clean": seconds_until_next_visit_url_clean,
            "seconds_until_next_visit_domain": seconds_until_next_visit_domain,
            "seconds_until_next_visit": seconds_until_next_visit,
            "page_transition": visit.get("transition", "").lower(),
            "id": visit.get("visitId", ""),
            "ref_id": visit.get("referringVisitId", ""),
            "is_local": visit.get("isLocal", ""),
            "client_id": visit.get("clientId", ""),
            "updated_at": format_local_date_time(datetime.now())
        }
        csv_rows.append(csv_row)
    return csv_rows
def main(json_file_path, output_csv_path):
    with open(json_file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    # Takeout nests the records under a "Browser History" key; fall back to
    # the top-level object in case the file is already a bare list of visits.
    data = json_data.get("Browser History", json_data)
    now = datetime.now().timestamp() * 1000  # Current time in milliseconds

    visits = []
    for item in data:
        visit_time = None
        if "time_usec" in item:
            visit_time = int(item["time_usec"]) / 1000  # Microseconds -> milliseconds
        elif "visitTime" in item:
            visit_time = int(item["visitTime"])
        if visit_time is not None:
            visits.append({
                "url": item.get("url", ""),
                "title": item.get("title", ""),
                "visitTime": visit_time,
                "transition": item.get("page_transition", ""),
                "visitId": item.get("visitId", ""),
                "referringVisitId": item.get("referringVisitId", ""),
                "isLocal": item.get("isLocal", 0),
                "clientId": item.get("client_id", "")
            })

    # Newest-first order lets process_visits derive each record's next visit.
    visits.sort(key=lambda x: x["visitTime"], reverse=True)
    csv_rows = process_visits(visits, now)

    csv_headers = [
        "url", "url_clean", "url_domain", "title", "time", "hour", "day_of_week",
        "is_weekend", "day_of_month", "week_of_month", "month_of_year", "total_history_days",
        "seconds_until_next_visit_url", "seconds_until_next_visit_url_clean",
        "seconds_until_next_visit_domain", "seconds_until_next_visit",
        "page_transition", "id", "ref_id", "is_local", "client_id", "updated_at"
    ]
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
        writer.writeheader()
        writer.writerows(csv_rows)
    print(f"CSV file created at: {output_csv_path}")
if __name__ == "__main__":
    input_json = "path/to/your/input.json"  # Replace with the path to your JSON file
    output_csv = "path/to/your/output.csv"  # Replace with the desired output CSV file path
    main(input_json, output_csv)
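After replacing the two placeholder paths, save the script (for example as history_to_csv.py) and run it with python history_to_csv.py; the CSV will be written to the output path you specified.

One caveat: main() reads the whole file into memory with json.load, which is fine for most exports but can strain RAM on multi-gigabyte files. In that case a streaming parser can help. Below is a minimal sketch using the third-party ijson library (install it with pip install ijson); the 'Browser History.item' prefix assumes the standard Takeout layout shown earlier, and you would still need to collect and sort the visits newest-first before calling process_visits, since the gap calculations depend on that order:

import ijson  # Third-party streaming JSON parser: pip install ijson

def iter_visits(json_file_path):
    # Yield one history record at a time instead of loading the whole file.
    with open(json_file_path, 'rb') as f:
        for item in ijson.items(f, 'Browser History.item'):
            yield item

# Example: count visits without holding the entire file in memory.
# total = sum(1 for _ in iter_visits("path/to/your/input.json"))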