Dataset Viewer
Auto-converted to Parquet Duplicate
repo
stringlengths
1
99
file
stringlengths
13
215
code
stringlengths
12
59.2M
file_length
int64
12
59.2M
avg_line_length
float64
3.82
1.48M
max_line_length
int64
12
2.51M
extension_type
stringclasses
1 value
TiKick
TiKick-main/setup.py
#!/usr/bin/env python # -*- coding: utf-8 -*- # Copyright 2021 The TARTRL Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unle...
1,788
35.510204
74
py
TiKick
TiKick-main/tmarl/networks/policy_network.py
import torch import torch.nn as nn from tmarl.networks.utils.util import init, check from tmarl.networks.utils.mlp import MLPBase, MLPLayer from tmarl.networks.utils.rnn import RNNLayer from tmarl.networks.utils.act import ACTLayer from tmarl.networks.utils.popart import PopArt from tmarl.utils.util import get_shape_...
5,558
41.113636
181
py
TiKick
TiKick-main/tmarl/networks/utils/distributions.py
import torch import torch.nn as nn from .util import init """ Modify standard PyTorch distributions so they are compatible with this code. """ # # Standardize distribution interfaces # # Categorical class FixedCategorical(torch.distributions.Categorical): def sample(self): return super().sample().unsque...
3,466
27.891667
86
py
TiKick
TiKick-main/tmarl/networks/utils/mlp.py
import torch.nn as nn from .util import init, get_clones class MLPLayer(nn.Module): def __init__(self, input_dim, hidden_size, layer_N, use_orthogonal, activation_id): super(MLPLayer, self).__init__() self._layer_N = layer_N active_func = [nn.Tanh(), nn.ReLU(), nn.LeakyReLU(), nn.ELU()]...
2,116
32.603175
98
py
TiKick
TiKick-main/tmarl/networks/utils/popart.py
import math import numpy as np import torch import torch.nn as nn import torch.nn.functional as F class PopArt(torch.nn.Module): def __init__(self, input_shape, output_shape, norm_axes=1, beta=0.99999, epsilon=1e-5, device=torch.device("cpu")): super(PopArt, self).__init__() self.bet...
3,796
38.968421
119
py
TiKick
TiKick-main/tmarl/networks/utils/util.py
import copy import numpy as np import torch import torch.nn as nn def init(module, weight_init, bias_init, gain=1): weight_init(module.weight.data, gain=gain) bias_init(module.bias.data) return module def get_clones(module, N): return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) def che...
426
21.473684
76
py
TiKick
TiKick-main/tmarl/networks/utils/act.py
from .distributions import Bernoulli, Categorical, DiagGaussian import torch import torch.nn as nn class ACTLayer(nn.Module): def __init__(self, action_space, inputs_dim, use_orthogonal, gain): super(ACTLayer, self).__init__() self.multidiscrete_action = False self.continuous_action = Fal...
7,195
46.342105
121
py
TiKick
TiKick-main/tmarl/networks/utils/rnn.py
import torch import torch.nn as nn class RNNLayer(nn.Module): def __init__(self, inputs_dim, outputs_dim, recurrent_N, use_orthogonal): super(RNNLayer, self).__init__() self._recurrent_N = recurrent_N self._use_orthogonal = use_orthogonal self.rnn = nn.GRU(inputs_dim, outputs_dim...
2,816
34.2125
132
py
TiKick
TiKick-main/tmarl/drivers/shared_distributed/base_driver.py
import numpy as np import torch def _t2n(x): return x.detach().cpu().numpy() class Driver(object): def __init__(self, config, client=None): self.all_args = config['all_args'] self.envs = config['envs'] self.eval_envs = config['eval_envs'] self.device = config['device'] ...
4,244
39.04717
126
py
TiKick
TiKick-main/tmarl/algorithms/r_mappo_distributed/mappo_algorithm.py
import torch from tmarl.utils.valuenorm import ValueNorm # implement the loss of the MAPPO here class MAPPOAlgorithm(): def __init__(self, args, init_module, device=torch.device("cpu")): self.device = device self.tpdv = dict(dtype=torch.float32, ...
2,234
38.210526
147
py
TiKick
TiKick-main/tmarl/algorithms/r_mappo_distributed/mappo_module.py
import torch from tmarl.networks.policy_network import PolicyNetwork class MAPPOModule: def __init__(self, args, obs_space, share_obs_space, act_space, device=torch.device("cpu")): self.device = device self.lr = args.lr self.critic_lr = args.critic_lr self.opti_eps = args....
1,050
41.04
135
py
TiKick
TiKick-main/tmarl/replay_buffers/normal/shared_buffer.py
import torch import numpy as np from collections import defaultdict from tmarl.utils.util import check,get_shape_from_obs_space, get_shape_from_act_space def _flatten(T, N, x): return x.reshape(T * N, *x.shape[2:]) def _cast(x): return x.transpose(1, 2, 0, 3).reshape(-1, *x.shape[3:]) class SharedReplayBuff...
28,769
52.081181
231
py
TiKick
TiKick-main/tmarl/configs/config.py
#!/usr/bin/env python # -*- coding: utf-8 -*- # Copyright 2021 The TARTRL Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unle...
10,665
55.734043
146
py
TiKick
TiKick-main/tmarl/runners/base_evaluator.py
#!/usr/bin/env python # -*- coding: utf-8 -*- # Copyright 2021 The TARTRL Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unle...
3,402
28.08547
97
py
TiKick
TiKick-main/tmarl/runners/base_runner.py
#!/usr/bin/env python # -*- coding: utf-8 -*- # Copyright 2021 The TARTRL Authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 # # Unle...
1,079
22.478261
74
py
TiKick
TiKick-main/tmarl/utils/valuenorm.py
import numpy as np import torch import torch.nn as nn class ValueNorm(nn.Module): """ Normalize a vector of observations - across the first norm_axes dimensions""" def __init__(self, input_shape, norm_axes=1, beta=0.99999, per_element_update=False, epsilon=1e-5, device=torch.device("cpu")): super(V...
3,110
37.8875
131
py
TiKick
TiKick-main/tmarl/utils/util.py
import copy import numpy as np import math import gym import torch import torch.nn as nn import torch.nn.functional as F import torch.distributed as dist from torch.autograd import Variable from gym.spaces import Box, Discrete, Tuple def check(input): if type(input) == np.ndarray: return torch.from_numpy...
13,893
31.846336
122
py
TiKick
TiKick-main/tmarl/utils/gpu_mem_track.py
# code from https://github.com/Oldpan/Pytorch-Memory-Utils import gc import datetime import inspect import torch import numpy as np dtype_memory_size_dict = { torch.float64: 64/8, torch.double: 64/8, torch.float32: 32/8, torch.float: 32/8, torch.float16: 16/8, torch.half: 16/8, torch.int6...
4,432
36.888889
129
py
TiKick
TiKick-main/tmarl/utils/modelsize_estimate.py
# code from https://github.com/Oldpan/Pytorch-Memory-Utils import torch.nn as nn import numpy as np def modelsize(model, input, type_size=4): para = sum([np.prod(list(p.size())) for p in model.parameters()]) # print('Model {} : Number of params: {}'.format(model._get_name(), para)) print('Model {} : para...
1,428
34.725
116
py
RobDanns
RobDanns-main/deep_learning/tools/corruptions-inference-tinyimagenet.py
#!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the original graph2nn github repo. # File modifications and additions by Rowan AI Lab, licensed under the Creative Commons Zero v1.0 Universal # LICENSE file in the root directory ...
25,928
41.092532
139
py
RobDanns
RobDanns-main/deep_learning/tools/train_resnet18_on_tinyimagenet200.py
#!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the original graph2nn github repo. # File modifications and additions by Rowan AI Lab, licensed under the Creative Commons Zero v1.0 Universal # LICENSE file in the root directory ...
21,617
37.741935
129
py
RobDanns
RobDanns-main/deep_learning/tools/adversarial-inference-tinyimagenet200.py
#!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the original graph2nn github repo. # File modifications and additions by Rowan AI Lab, licensed under the Creative Commons Zero v1.0 Universal # LICENSE file in the root directory ...
23,184
38.768439
147
py
End of preview. Expand in Data Studio

ArXiv Deep Learning Python Research Code

A curated corpus of Python source code files extracted from GitHub repositories referenced in ArXiv papers. It contains 391,496 files (1.49 GB), filtered to files that mention deep learning frameworks, and is designed for training and evaluating Code LLMs on research-grade code.

Dataset Summary

| Statistic | Value |
|---|---|
| Total files | 391,496 |
| Total size | 1.49 GB |
| Source repos | 34,099 |
| Time span | ArXiv inception through July 2023 |

Dataset Structure

| Field | Type | Description |
|---|---|---|
| repo | string | GitHub repository name |
| file | string | File path in the repository |
| code | string | File contents |
| file_length | int64 | Number of characters in the file |
| avg_line_length | float64 | Average line length |
| max_line_length | int64 | Maximum line length |
| extension_type | string | File extension |

Usage

from datasets import load_dataset

# full dataset
ds = load_dataset("AlgorithmicResearchGroup/arxiv_deep_learning_python_research_code", split="train")

# streaming
ds = load_dataset("AlgorithmicResearchGroup/arxiv_deep_learning_python_research_code", streaming=True, split="train")
for sample in ds:
    print(sample["repo"], sample["file"])
    break

Data Collection

The names of 34,099 active GitHub repositories were extracted from ArXiv papers spanning ArXiv's inception through July 21st, 2023. Together, these repositories total 773 GB of compressed data.

These repositories were filtered to files mentioning any of the following frameworks: torch, jax, flax, stax, haiku, keras, fastai, xgboost, caffe, mxnet. This yielded 1.4 million files, which were further filtered to the final 391k files.

Sensitive Information

The dataset may contain emails, IP addresses, and API/SSH keys that were previously published in public GitHub repositories.

Related Resources

Citation

@misc{arxiv_deep_learning_python_research_code,
    title={ArXiv Deep Learning Python Research Code},
    author={Matthew Kenney},
    year={2023},
    publisher={Hugging Face},
    url={https://huggingface.co/datasets/AlgorithmicResearchGroup/arxiv_deep_learning_python_research_code}
}
Downloads last month
229

Models trained or fine-tuned on AlgorithmicResearchGroup/arxiv_deep_learning_python_research_code