Datasets:

AlgorithmicResearchGroup
/

arxiv_deep_learning_python_research_code

repo stringlengths 1 99	file stringlengths 13 215	code stringlengths 12 59.2M	file_length int64 12 59.2M	avg_line_length float64 3.82 1.48M	max_line_length int64 12 2.51M	extension_type stringclasses 1 value
TiKick	TiKick-main/setup.py	#!/usr/bin/env python # -- coding: utf-8 -- # Copyright 2021 The TARTRL Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unle...	1,788	35.510204	74	py
TiKick	TiKick-main/tmarl/networks/policy_network.py	import torch import torch.nn as nn from tmarl.networks.utils.util import init, check from tmarl.networks.utils.mlp import MLPBase, MLPLayer from tmarl.networks.utils.rnn import RNNLayer from tmarl.networks.utils.act import ACTLayer from tmarl.networks.utils.popart import PopArt from tmarl.utils.util import get_shape_...	5,558	41.113636	181	py
TiKick	TiKick-main/tmarl/networks/utils/distributions.py	import torch import torch.nn as nn from .util import init """ Modify standard PyTorch distributions so they are compatible with this code. """ # # Standardize distribution interfaces # # Categorical class FixedCategorical(torch.distributions.Categorical): def sample(self): return super().sample().unsque...	3,466	27.891667	86	py
TiKick	TiKick-main/tmarl/networks/utils/mlp.py	import torch.nn as nn from .util import init, get_clones class MLPLayer(nn.Module): def __init__(self, input_dim, hidden_size, layer_N, use_orthogonal, activation_id): super(MLPLayer, self).__init__() self._layer_N = layer_N active_func = [nn.Tanh(), nn.ReLU(), nn.LeakyReLU(), nn.ELU()]...	2,116	32.603175	98	py
TiKick	TiKick-main/tmarl/networks/utils/popart.py	import math import numpy as np import torch import torch.nn as nn import torch.nn.functional as F class PopArt(torch.nn.Module): def __init__(self, input_shape, output_shape, norm_axes=1, beta=0.99999, epsilon=1e-5, device=torch.device("cpu")): super(PopArt, self).__init__() self.bet...	3,796	38.968421	119	py
TiKick	TiKick-main/tmarl/networks/utils/util.py	import copy import numpy as np import torch import torch.nn as nn def init(module, weight_init, bias_init, gain=1): weight_init(module.weight.data, gain=gain) bias_init(module.bias.data) return module def get_clones(module, N): return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) def che...	426	21.473684	76	py
TiKick	TiKick-main/tmarl/networks/utils/act.py	from .distributions import Bernoulli, Categorical, DiagGaussian import torch import torch.nn as nn class ACTLayer(nn.Module): def __init__(self, action_space, inputs_dim, use_orthogonal, gain): super(ACTLayer, self).__init__() self.multidiscrete_action = False self.continuous_action = Fal...	7,195	46.342105	121	py
TiKick	TiKick-main/tmarl/networks/utils/rnn.py	import torch import torch.nn as nn class RNNLayer(nn.Module): def __init__(self, inputs_dim, outputs_dim, recurrent_N, use_orthogonal): super(RNNLayer, self).__init__() self._recurrent_N = recurrent_N self._use_orthogonal = use_orthogonal self.rnn = nn.GRU(inputs_dim, outputs_dim...	2,816	34.2125	132	py
TiKick	TiKick-main/tmarl/drivers/shared_distributed/base_driver.py	import numpy as np import torch def _t2n(x): return x.detach().cpu().numpy() class Driver(object): def __init__(self, config, client=None): self.all_args = config['all_args'] self.envs = config['envs'] self.eval_envs = config['eval_envs'] self.device = config['device'] ...	4,244	39.04717	126	py
TiKick	TiKick-main/tmarl/algorithms/r_mappo_distributed/mappo_algorithm.py	import torch from tmarl.utils.valuenorm import ValueNorm # implement the loss of the MAPPO here class MAPPOAlgorithm(): def __init__(self, args, init_module, device=torch.device("cpu")): self.device = device self.tpdv = dict(dtype=torch.float32, ...	2,234	38.210526	147	py
TiKick	TiKick-main/tmarl/algorithms/r_mappo_distributed/mappo_module.py	import torch from tmarl.networks.policy_network import PolicyNetwork class MAPPOModule: def __init__(self, args, obs_space, share_obs_space, act_space, device=torch.device("cpu")): self.device = device self.lr = args.lr self.critic_lr = args.critic_lr self.opti_eps = args....	1,050	41.04	135	py
TiKick	TiKick-main/tmarl/replay_buffers/normal/shared_buffer.py	import torch import numpy as np from collections import defaultdict from tmarl.utils.util import check,get_shape_from_obs_space, get_shape_from_act_space def _flatten(T, N, x): return x.reshape(T * N, x.shape[2:]) def _cast(x): return x.transpose(1, 2, 0, 3).reshape(-1, x.shape[3:]) class SharedReplayBuff...	28,769	52.081181	231	py
TiKick	TiKick-main/tmarl/configs/config.py	#!/usr/bin/env python # -- coding: utf-8 -- # Copyright 2021 The TARTRL Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unle...	10,665	55.734043	146	py
TiKick	TiKick-main/tmarl/runners/base_evaluator.py	#!/usr/bin/env python # -- coding: utf-8 -- # Copyright 2021 The TARTRL Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unle...	3,402	28.08547	97	py
TiKick	TiKick-main/tmarl/runners/base_runner.py	#!/usr/bin/env python # -- coding: utf-8 -- # Copyright 2021 The TARTRL Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unle...	1,079	22.478261	74	py
TiKick	TiKick-main/tmarl/utils/valuenorm.py	import numpy as np import torch import torch.nn as nn class ValueNorm(nn.Module): """ Normalize a vector of observations - across the first norm_axes dimensions""" def __init__(self, input_shape, norm_axes=1, beta=0.99999, per_element_update=False, epsilon=1e-5, device=torch.device("cpu")): super(V...	3,110	37.8875	131	py
TiKick	TiKick-main/tmarl/utils/util.py	import copy import numpy as np import math import gym import torch import torch.nn as nn import torch.nn.functional as F import torch.distributed as dist from torch.autograd import Variable from gym.spaces import Box, Discrete, Tuple def check(input): if type(input) == np.ndarray: return torch.from_numpy...	13,893	31.846336	122	py
TiKick	TiKick-main/tmarl/utils/gpu_mem_track.py	# code from https://github.com/Oldpan/Pytorch-Memory-Utils import gc import datetime import inspect import torch import numpy as np dtype_memory_size_dict = { torch.float64: 64/8, torch.double: 64/8, torch.float32: 32/8, torch.float: 32/8, torch.float16: 16/8, torch.half: 16/8, torch.int6...	4,432	36.888889	129	py
TiKick	TiKick-main/tmarl/utils/modelsize_estimate.py	# code from https://github.com/Oldpan/Pytorch-Memory-Utils import torch.nn as nn import numpy as np def modelsize(model, input, type_size=4): para = sum([np.prod(list(p.size())) for p in model.parameters()]) # print('Model {} : Number of params: {}'.format(model._get_name(), para)) print('Model {} : para...	1,428	34.725	116	py
RobDanns	RobDanns-main/deep_learning/tools/corruptions-inference-tinyimagenet.py	#!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the original graph2nn github repo. # File modifications and additions by Rowan AI Lab, licensed under the Creative Commons Zero v1.0 Universal # LICENSE file in the root directory ...	25,928	41.092532	139	py
RobDanns	RobDanns-main/deep_learning/tools/train_resnet18_on_tinyimagenet200.py	#!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the original graph2nn github repo. # File modifications and additions by Rowan AI Lab, licensed under the Creative Commons Zero v1.0 Universal # LICENSE file in the root directory ...	21,617	37.741935	129	py
RobDanns	RobDanns-main/deep_learning/tools/adversarial-inference-tinyimagenet200.py	#!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the original graph2nn github repo. # File modifications and additions by Rowan AI Lab, licensed under the Creative Commons Zero v1.0 Universal # LICENSE file in the root directory ...	23,184	38.768439	147	py

End of preview. Expand in Data Studio

ArXiv Deep Learning Python Research Code

A curated corpus of Python source code files extracted from GitHub repositories referenced in ArXiv papers. It contains 391,496 files (1.49 GB), filtered to those that mention deep learning frameworks, and is designed for training and evaluating Code LLMs on research-grade code.

Dataset Summary

Statistic	Value
Total files	391,496
Total size	1.49 GB
Source repos	34,099
Time span	ArXiv inception through July 2023

Dataset Structure

Field	Type	Description
`repo`	string	GitHub repository name
`file`	string	File path in the repository
`code`	string	File contents
`file_length`	int64	Number of characters in the file
`avg_line_length`	float64	Average line length
`max_line_length`	int64	Maximum line length
`extension_type`	string	File extension

Usage

from datasets import load_dataset

# full dataset
ds = load_dataset("AlgorithmicResearchGroup/arxiv_deep_learning_python_research_code", split="train")

# streaming
ds = load_dataset("AlgorithmicResearchGroup/arxiv_deep_learning_python_research_code", streaming=True, split="train")
for sample in ds:
    print(sample["repo"], sample["file"])
    break

Data Collection

The names of 34,099 active GitHub repositories were extracted from ArXiv papers published from ArXiv's inception through July 21st, 2023. Together, these repositories total 773 GB when compressed.

These repositories were filtered to files mentioning any of the following frameworks: torch, jax, flax, stax, haiku, keras, fastai, xgboost, caffe, mxnet. This yielded 1.4 million files, which were further filtered to the final 391k files.

Sensitive Information

The dataset may contain emails, IP addresses, and API/SSH keys that were previously published in public GitHub repositories.

Related Resources

ArXiv DL Instruct - Instruction-tuning dataset derived from this code
Algorithmic Research Group - Open Source

Citation

@misc{arxiv_deep_learning_python_research_code,
    title={ArXiv Deep Learning Python Research Code},
    author={Matthew Kenney},
    year={2023},
    publisher={Hugging Face},
    url={https://huggingface.co/datasets/AlgorithmicResearchGroup/arxiv_deep_learning_python_research_code}
}

Downloads last month: 229

Models trained or fine-tuned on AlgorithmicResearchGroup/arxiv_deep_learning_python_research_code

Amaraa0404/AI-First

Question Answering • Updated Nov 13, 2023