Google Takeout allows users to download their data, including Chrome browsing history, in JSON format. This is a convenient way to keep a record of your online activity, but working with large JSON files, especially ones over 100MB, can be challenging. This article explains how to convert these large JSON files into CSV format efficiently; the recommended approach depends on the file size.
If your Google Takeout Chrome History JSON file is 300MB or smaller, our Chrome extension is the ideal tool for the job. The extension is designed specifically to manage and process large datasets using Chrome APIs and IndexedDB.
Get the Chrome extension here.
For Google Takeout Chrome History JSON files larger than 300MB, we recommend a Python script instead, which avoids in-browser performance issues and handles the data more reliably. Python is better suited to very large datasets because it manages memory more effectively and processes the data faster than an extension running inside the browser.
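For reference, the Chrome history file in a Takeout export (usually named BrowserHistory.json) wraps the visit records in a "Browser History" array, with timestamps recorded in microseconds since the Unix epoch. The snippet below is an illustrative sketch of that structure with made-up values; exact fields can vary between exports, so check your own file before running the script:

{
  "Browser History": [
    {
      "page_transition": "LINK",
      "title": "Example Domain",
      "url": "https://example.com/",
      "client_id": "abcd1234",
      "time_usec": 1700000000000000
    }
  ]
}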
Python Script for Converting JSON to CSV
Below is a Python script designed to process large Google Takeout JSON files and convert them into CSV format:
import json
import csv
from datetime import datetime
from urllib.parse import urlparse

def clean_url(url):
    return url.replace("http://", "").replace("https://", "").split("?")[0].split("#")[0]

def extract_domain(url):
    return urlparse(url).hostname

def format_local_date_time(dt):
    return dt.strftime('%Y-%m-%d %H:%M:%S')
def process_visits(visits, now):
    # Visits arrive sorted newest-first, so each map remembers the most
    # recently processed (i.e. chronologically *next*) visit time for a URL,
    # cleaned URL, or domain.
    next_visit_to_same_url_map = {}
    next_visit_to_same_url_clean_map = {}
    next_visit_to_same_domain_map = {}
    last_visit_time = now
    distinct_days = {datetime.fromtimestamp(visit["visitTime"] / 1000).date() for visit in visits}
    total_history_days = len(distinct_days)
    csv_rows = []

    for visit in visits:
        visit_date = datetime.fromtimestamp(visit["visitTime"] / 1000)
        url = visit["url"]
        domain = extract_domain(url)
        url_clean = clean_url(url)

        # Calendar features derived from the visit timestamp.
        hour = visit_date.hour
        day_of_week = visit_date.weekday() + 1  # Monday = 1 ... Sunday = 7
        is_weekend = 1 if day_of_week in [6, 7] else 0
        day_of_month = visit_date.day
        first_day = datetime(visit_date.year, visit_date.month, 1).weekday()
        week_of_month = (day_of_month + first_day) // 7 + 1
        month_of_year = visit_date.month

        # Look up when this URL/domain was next visited; default to "now"
        # when this is the most recent visit on record.
        next_visit_to_same_url = next_visit_to_same_url_map.get(url, now)
        next_visit_to_same_url_clean = next_visit_to_same_url_clean_map.get(url_clean, now)
        next_visit_to_same_domain = next_visit_to_same_domain_map.get(domain, now)

        seconds_until_next_visit_url = round((next_visit_to_same_url - visit["visitTime"]) / 1000)
        seconds_until_next_visit_url_clean = round((next_visit_to_same_url_clean - visit["visitTime"]) / 1000)
        seconds_until_next_visit_domain = round((next_visit_to_same_domain - visit["visitTime"]) / 1000)
        seconds_until_next_visit = round((last_visit_time - visit["visitTime"]) / 1000)

        last_visit_time = visit["visitTime"]
        next_visit_to_same_url_map[url] = visit["visitTime"]
        next_visit_to_same_url_clean_map[url_clean] = visit["visitTime"]
        next_visit_to_same_domain_map[domain] = visit["visitTime"]

        csv_row = {
            "url": url,
            "url_clean": url_clean,
            "url_domain": domain,
            # csv.DictWriter escapes quotes itself, so the title only needs truncating.
            "title": visit.get("title", "")[:300],
            "time": format_local_date_time(visit_date),
            "hour": hour,
            "day_of_week": day_of_week,
            "is_weekend": is_weekend,
            "day_of_month": day_of_month,
            "week_of_month": week_of_month,
            "month_of_year": month_of_year,
            "total_history_days": total_history_days,
            "seconds_until_next_visit_url": seconds_until_next_visit_url,
            "seconds_until_next_visit_url_clean": seconds_until_next_visit_url_clean,
            "seconds_until_next_visit_domain": seconds_until_next_visit_domain,
            "seconds_until_next_visit": seconds_until_next_visit,
            "page_transition": visit.get("transition", "").lower(),
            "id": visit.get("visitId", ""),
            "ref_id": visit.get("referringVisitId", ""),
            "is_local": visit.get("isLocal", ""),
            "client_id": visit.get("clientId", ""),
            "updated_at": format_local_date_time(datetime.now())
        }
        csv_rows.append(csv_row)
    return csv_rows
def main(json_file_path, output_csv_path):
    with open(json_file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    # Takeout nests the records under a "Browser History" key; fall back to
    # the top-level object in case the file is already a bare list of visits.
    data = json_data.get("Browser History", json_data)
    now = datetime.now().timestamp() * 1000  # Current time in milliseconds

    visits = []
    for item in data:
        visit_time = None
        if "time_usec" in item:
            visit_time = int(item["time_usec"]) / 1000  # Microseconds -> milliseconds
        elif "visitTime" in item:
            visit_time = int(item["visitTime"])
        if visit_time is not None:
            visits.append({
                "url": item.get("url", ""),
                "title": item.get("title", ""),
                "visitTime": visit_time,
                "transition": item.get("page_transition", ""),
                "visitId": item.get("visitId", ""),
                "referringVisitId": item.get("referringVisitId", ""),
                "isLocal": item.get("isLocal", 0),
                "clientId": item.get("client_id", "")
            })

    # Newest-first order lets process_visits derive each record's next visit.
    visits.sort(key=lambda x: x["visitTime"], reverse=True)
    csv_rows = process_visits(visits, now)

    csv_headers = [
        "url", "url_clean", "url_domain", "title", "time", "hour", "day_of_week",
        "is_weekend", "day_of_month", "week_of_month", "month_of_year", "total_history_days",
        "seconds_until_next_visit_url", "seconds_until_next_visit_url_clean",
        "seconds_until_next_visit_domain", "seconds_until_next_visit",
        "page_transition", "id", "ref_id", "is_local", "client_id", "updated_at"
    ]
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
        writer.writeheader()
        writer.writerows(csv_rows)
    print(f"CSV file created at: {output_csv_path}")
if __name__ == "__main__":
    input_json = "path/to/your/input.json"  # Replace with the path to your JSON file
    output_csv = "path/to/your/output.csv"  # Replace with the desired output CSV file path
    main(input_json, output_csv)
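After replacing the two placeholder paths, save the script (for example as history_to_csv.py) and run it with python history_to_csv.py; the CSV will be written to the output path you specified.

One caveat: main() reads the whole file into memory with json.load, which is fine for most exports but can strain RAM on multi-gigabyte files. In that case a streaming parser can help. Below is a minimal sketch using the third-party ijson library (install it with pip install ijson); the 'Browser History.item' prefix assumes the standard Takeout layout shown earlier, and you would still need to collect and sort the visits newest-first before calling process_visits, since the gap calculations depend on that order:

import ijson  # Third-party streaming JSON parser: pip install ijson

def iter_visits(json_file_path):
    # Yield one history record at a time instead of loading the whole file.
    with open(json_file_path, 'rb') as f:
        for item in ijson.items(f, 'Browser History.item'):
            yield item

# Example: count visits without holding the entire file in memory.
# total = sum(1 for _ in iter_visits("path/to/your/input.json"))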