Dump of the stacks project

Noah Gießing, Moritz Schubotz

Published: 20 Nov 2025, Last Modified: 15 Jan 2026 | Zenodo | Everyone | Revisions | CC BY-SA 4.0
Abstract: The dataset was created by the following Python code in the context of https://github.com/MaRDI4NFDI/MaRDIRoadmap/issues/134:

```python
import pandas as pd
import json
from concurrent.futures import as_completed
from requests_futures.sessions import FuturesSession

tag_url_prefix = 'https://stacks.math.columbia.edu/data/tag/0ELT/structure'
tag_structure_suffix = '/structure'
tags_content_suffix = '/content/full'
structures = [json.loads(requests.get(tag_url_prefix+prt+tag_structure_suffix)) for prt in parts]

def tree_to_list(tree, depth=0, parents=None):
    result = []
    if parents is None:
        parents = []
    for node in tree:
        try:
            node_info = {'tag': node['tag'], 'name':node['name'], 'reference':node['reference'], 'type':node['type'],'depth': depth, 'parents': parents}
            result.append(node_info)
        except:
            node_info = {'tag': node['tag'], 'name': 'N/A', 'reference': node['reference'], 'type': node['type'], 'depth': depth, 'parents': parents}
            result.append(node_info)
        if 'children' in node:
            result.extend(tree_to_list(node['children'], depth + 1, [node['tag']] + parents))
    return result

structure_list = tree_to_list(structures)
structure_df = pd.DataFrame(structure_list)
with FuturesSession() as session:
    futures = [session.get(tag_url_prefix+structure_df.iloc[_]['tag']+'/content/full') for _ in range(100)]
    for future in as_completed(futures):
        response = future.result()
d = {'content':[future.result().content for future in futures]}
content_df = pd.DataFrame(d)
structure_df['content'] = content_df
structure_df.to_csv('~/stacks_project.csv',index=False)
```
Loading