:gitee_url: https://gitee.com/mindspore/docs .. _program_listing_file_include_datasets.h: Program Listing for File datasets.h =================================== |exhale_lsh| :ref:`Return to documentation for file <file_include_datasets.h>` (``include/datasets.h``) .. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS .. code-block:: cpp #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_DATASETS_H_ #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_DATASETS_H_ #include #include #include #include #include #include #include #include #include #include #include #include #include "include/api/dual_abi_helper.h" #include "include/api/types.h" #include "include/dataset/iterator.h" #include "include/dataset/json_fwd.hpp" #include "include/dataset/samplers.h" #include "include/dataset/text.h" namespace mindspore { namespace dataset { class Tensor; class TensorShape; class TreeAdapter; class TreeAdapterLite; class TreeGetters; class Vocab; class DatasetCache; class DatasetNode; class Iterator; class PullBasedIterator; class TensorOperation; class SchemaObj; class SamplerObj; class CsvBase; // Dataset classes (in alphabetical order) class BatchDataset; class MapDataset; class ProjectDataset; class ShuffleDataset; class BucketBatchByLengthDataset; class FilterDataset; class CSVDataset; class TransferDataset; class ConcatDataset; class RenameDataset; class SentencePieceVocab; enum class SentencePieceModel; class DSCallback; class RepeatDataset; class SkipDataset; class TakeDataset; class ZipDataset; class Dataset : public std::enable_shared_from_this { public: // need friend class so they can access the children_ field friend class Iterator; friend class TransferNode; Dataset(); ~Dataset() = default; int64_t GetDatasetSize(bool estimate = false); std::vector GetOutputTypes(); std::vector> GetOutputShapes(); int64_t GetBatchSize(); int64_t GetRepeatCount(); int64_t GetNumClasses(); std::vector GetColumnNames() { return VectorCharToString(GetColumnNamesCharIF()); } 
std::vector>> GetClassIndexing() { return ClassIndexCharToString(GetClassIndexingCharIF()); } std::shared_ptr SetNumWorkers(int32_t num_workers); std::shared_ptr CreatePullBasedIterator(std::vector> columns = {}); std::shared_ptr CreateIterator(std::vector columns = {}, int32_t num_epochs = -1) { return CreateIteratorCharIF(VectorStringToChar(columns), num_epochs); } bool DeviceQueue(std::string queue_name = "", std::string device_type = "", int32_t device_id = 0, int32_t num_epochs = -1, bool send_epoch_end = true, int32_t total_batches = 0, bool create_data_info_queue = false) { return DeviceQueueCharIF(StringToChar(queue_name), StringToChar(device_type), device_id, num_epochs, send_epoch_end, total_batches, create_data_info_queue); } bool Save(std::string dataset_path, int32_t num_files = 1, std::string dataset_type = "mindrecord") { return SaveCharIF(StringToChar(dataset_path), num_files, StringToChar(dataset_type)); } std::shared_ptr Batch(int32_t batch_size, bool drop_remainder = false); std::shared_ptr BucketBatchByLength( const std::vector &column_names, const std::vector &bucket_boundaries, const std::vector &bucket_batch_sizes, std::function element_length_function = nullptr, const std::map, MSTensor>> &pad_info = {}, bool pad_to_bucket_boundary = false, bool drop_remainder = false) { return std::make_shared( shared_from_this(), VectorStringToChar(column_names), bucket_boundaries, bucket_batch_sizes, element_length_function, PadInfoStringToChar(pad_info), pad_to_bucket_boundary, drop_remainder); } std::shared_ptr BuildSentencePieceVocab( const std::vector &col_names, int32_t vocab_size, float character_coverage, SentencePieceModel model_type, const std::unordered_map &params) { return BuildSentencePieceVocabCharIF(VectorStringToChar(col_names), vocab_size, character_coverage, model_type, UnorderedMapStringToChar(params)); } std::shared_ptr BuildVocab(const std::vector &columns = {}, const std::pair &freq_range = {0, kDeMaxFreq}, int64_t top_k = kDeMaxTopk, 
const std::vector &special_tokens = {}, bool special_first = true) { return BuildVocabCharIF(VectorStringToChar(columns), freq_range, top_k, VectorStringToChar(special_tokens), special_first); } std::shared_ptr Concat(const std::vector> &datasets) { std::vector> all_datasets{shared_from_this()}; all_datasets.insert(std::end(all_datasets), std::begin(datasets), std::end(datasets)); return std::make_shared(all_datasets); } std::shared_ptr Filter(std::function predicate, const std::vector &input_columns = {}) { return std::make_shared(shared_from_this(), predicate, VectorStringToChar(input_columns)); } std::shared_ptr Map(std::vector operations, const std::vector &input_columns = {}, const std::vector &output_columns = {}, const std::vector &project_columns = {}, const std::shared_ptr &cache = nullptr, std::vector> callbacks = {}) { std::vector> transform_ops; (void)std::transform( operations.begin(), operations.end(), std::back_inserter(transform_ops), [](TensorTransform *op) -> std::shared_ptr { return op != nullptr ? op->Parse() : nullptr; }); return std::make_shared(shared_from_this(), transform_ops, VectorStringToChar(input_columns), VectorStringToChar(output_columns), VectorStringToChar(project_columns), cache, callbacks); } std::shared_ptr Map(std::vector> operations, const std::vector &input_columns = {}, const std::vector &output_columns = {}, const std::vector &project_columns = {}, const std::shared_ptr &cache = nullptr, std::vector> callbacks = {}) { std::vector> transform_ops; (void)std::transform(operations.begin(), operations.end(), std::back_inserter(transform_ops), [](std::shared_ptr op) -> std::shared_ptr { return op != nullptr ? 
op->Parse() : nullptr; }); return std::make_shared(shared_from_this(), transform_ops, VectorStringToChar(input_columns), VectorStringToChar(output_columns), VectorStringToChar(project_columns), cache, callbacks); } std::shared_ptr Map(const std::vector> operations, const std::vector &input_columns = {}, const std::vector &output_columns = {}, const std::vector &project_columns = {}, const std::shared_ptr &cache = nullptr, std::vector> callbacks = {}) { std::vector> transform_ops; (void)std::transform(operations.begin(), operations.end(), std::back_inserter(transform_ops), [](TensorTransform &op) -> std::shared_ptr { return op.Parse(); }); return std::make_shared(shared_from_this(), transform_ops, VectorStringToChar(input_columns), VectorStringToChar(output_columns), VectorStringToChar(project_columns), cache, callbacks); } std::shared_ptr Project(const std::vector &columns) { return std::make_shared(shared_from_this(), VectorStringToChar(columns)); } std::shared_ptr Rename(const std::vector &input_columns, const std::vector &output_columns) { return std::make_shared(shared_from_this(), VectorStringToChar(input_columns), VectorStringToChar(output_columns)); } std::shared_ptr Repeat(int32_t count = -1) { return std::make_shared(shared_from_this(), count); } std::shared_ptr Shuffle(int32_t buffer_size) { return std::make_shared(shared_from_this(), buffer_size); } std::shared_ptr Skip(int32_t count) { return std::make_shared(shared_from_this(), count); } std::shared_ptr Take(int32_t count = -1) { return std::make_shared(shared_from_this(), count); } std::shared_ptr Zip(const std::vector> &datasets) { std::vector> all_datasets = datasets; all_datasets.push_back(shared_from_this()); return std::make_shared(all_datasets); } std::shared_ptr IRNode() { return ir_node_; } protected: std::shared_ptr tree_getters_; std::shared_ptr ir_node_; private: // Char interface(CharIF) of GetColumnNames std::vector> GetColumnNamesCharIF(); // Char interface(CharIF) of GetClassIndexing 
std::vector, std::vector>> GetClassIndexingCharIF(); // Char interface(CharIF) of CreateIterator std::shared_ptr CreateIteratorCharIF(std::vector> columns, int32_t num_epochs); // Char interface(CharIF) of DeviceQueue bool DeviceQueueCharIF(const std::vector &queue_name, const std::vector &device_type, int32_t device_id, int32_t num_epochs, bool send_epoch_end, int32_t total_batches, bool create_data_info_queue); // Char interface(CharIF) of Save bool SaveCharIF(const std::vector &dataset_path, int32_t num_files, const std::vector &dataset_type); // Char interface(CharIF) of BuildSentencePieceVocab std::shared_ptr BuildSentencePieceVocabCharIF( const std::vector> &col_names, int32_t vocab_size, float character_coverage, SentencePieceModel model_type, const std::map, std::vector> &params); // Char interface(CharIF) of BuildVocab std::shared_ptr BuildVocabCharIF(const std::vector> &columns, const std::pair &freq_range, int64_t top_k, const std::vector> &special_tokens, bool special_first); }; class SchemaObj { public: explicit SchemaObj(const std::string &schema_file = "") : SchemaObj(StringToChar(schema_file)) {} ~SchemaObj() = default; Status Init(); Status add_column(const std::string &name, mindspore::DataType ms_type) { return add_column_char(StringToChar(name), ms_type); } Status add_column(const std::string &name, const std::string &ms_type) { return add_column_char(StringToChar(name), StringToChar(ms_type)); } Status add_column(const std::string &name, mindspore::DataType ms_type, const std::vector &shape) { return add_column_char(StringToChar(name), ms_type, shape); } Status add_column(const std::string &name, const std::string &ms_type, const std::vector &shape) { return add_column_char(StringToChar(name), StringToChar(ms_type), shape); } std::string to_json() { return CharToString(to_json_char()); } std::string to_string() { return to_json(); } void set_dataset_type(std::string dataset_type); void set_num_rows(int32_t num_rows); int32_t get_num_rows() const; 
Status FromJSONString(const std::string &json_string) { return FromJSONStringCharIF(StringToChar(json_string)); } Status ParseColumnString(const std::string &json_string) { return ParseColumnStringCharIF(StringToChar(json_string)); } private: Status parse_column(nlohmann::json columns); Status from_json(nlohmann::json json_obj); // Char constructor of SchemaObj explicit SchemaObj(const std::vector &schema_file); // Char interface of add_column Status add_column_char(const std::vector &name, mindspore::DataType ms_type); Status add_column_char(const std::vector &name, const std::vector &ms_type); Status add_column_char(const std::vector &name, mindspore::DataType ms_type, const std::vector &shape); Status add_column_char(const std::vector &name, const std::vector &ms_type, const std::vector &shape); // Char interface of to_json const std::vector to_json_char(); // Char interface of FromJSONString Status FromJSONStringCharIF(const std::vector &json_string); // Char interface of ParseColumnString Status ParseColumnStringCharIF(const std::vector &json_string); struct Data; std::shared_ptr data_; }; class BatchDataset : public Dataset { public: BatchDataset(std::shared_ptr input, int32_t batch_size, bool drop_remainder = false); ~BatchDataset() = default; }; class BucketBatchByLengthDataset : public Dataset { public: BucketBatchByLengthDataset( std::shared_ptr input, const std::vector> &column_names, const std::vector &bucket_boundaries, const std::vector &bucket_batch_sizes, std::function element_length_function = nullptr, const std::map, std::pair, MSTensor>> &pad_info = {}, bool pad_to_bucket_boundary = false, bool drop_remainder = false); ~BucketBatchByLengthDataset() = default; }; class ConcatDataset : public Dataset { public: explicit ConcatDataset(const std::vector> &input); ~ConcatDataset() = default; }; class FilterDataset : public Dataset { public: FilterDataset(std::shared_ptr input, std::function predicate, const std::vector> &input_columns); 
~FilterDataset() = default; }; class MapDataset : public Dataset { public: MapDataset(std::shared_ptr input, std::vector> operations, const std::vector> &input_columns, const std::vector> &output_columns, const std::vector> &project_columns, const std::shared_ptr &cache, std::vector> callbacks); ~MapDataset() = default; }; class ProjectDataset : public Dataset { public: ProjectDataset(std::shared_ptr input, const std::vector> &columns); ~ProjectDataset() = default; }; class RenameDataset : public Dataset { public: RenameDataset(std::shared_ptr input, const std::vector> &input_columns, const std::vector> &output_columns); ~RenameDataset() = default; }; class RepeatDataset : public Dataset { public: RepeatDataset(std::shared_ptr input, int32_t count); ~RepeatDataset() = default; }; class ShuffleDataset : public Dataset { public: ShuffleDataset(std::shared_ptr input, int32_t buffer_size); ~ShuffleDataset() = default; }; class SkipDataset : public Dataset { public: SkipDataset(std::shared_ptr input, int32_t count); ~SkipDataset() = default; }; class TakeDataset : public Dataset { public: TakeDataset(std::shared_ptr input, int32_t count); ~TakeDataset() = default; }; class ZipDataset : public Dataset { public: explicit ZipDataset(const std::vector> &inputs); ~ZipDataset() = default; }; std::shared_ptr SchemaCharIF(const std::vector &schema_file); inline std::shared_ptr Schema(const std::string &schema_file = "") { return SchemaCharIF(StringToChar(schema_file)); } class AlbumDataset : public Dataset { public: AlbumDataset(const std::vector &dataset_dir, const std::vector &data_schema, const std::vector> &column_names, bool decode, const std::shared_ptr &sampler, const std::shared_ptr &cache); AlbumDataset(const std::vector &dataset_dir, const std::vector &data_schema, const std::vector> &column_names, bool decode, const Sampler *sampler, const std::shared_ptr &cache); AlbumDataset(const std::vector &dataset_dir, const std::vector &data_schema, const std::vector> 
&column_names, bool decode, const std::reference_wrapper sampler, const std::shared_ptr &cache); ~AlbumDataset() = default; }; inline std::shared_ptr Album(const std::string &dataset_dir, const std::string &data_schema, const std::vector &column_names = {}, bool decode = false, const std::shared_ptr &sampler = std::make_shared(), const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_dir), StringToChar(data_schema), VectorStringToChar(column_names), decode, sampler, cache); } inline std::shared_ptr Album(const std::string &dataset_dir, const std::string &data_schema, const std::vector &column_names, bool decode, const Sampler *sampler, const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_dir), StringToChar(data_schema), VectorStringToChar(column_names), decode, sampler, cache); } inline std::shared_ptr Album(const std::string &dataset_dir, const std::string &data_schema, const std::vector &column_names, bool decode, const std::reference_wrapper sampler, const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_dir), StringToChar(data_schema), VectorStringToChar(column_names), decode, sampler, cache); } class CelebADataset : public Dataset { public: explicit CelebADataset(const std::vector &dataset_dir, const std::vector &usage, const std::shared_ptr &sampler, bool decode, const std::set> &extensions, const std::shared_ptr &cache); explicit CelebADataset(const std::vector &dataset_dir, const std::vector &usage, const Sampler *sampler, bool decode, const std::set> &extensions, const std::shared_ptr &cache); explicit CelebADataset(const std::vector &dataset_dir, const std::vector &usage, const std::reference_wrapper sampler, bool decode, const std::set> &extensions, const std::shared_ptr &cache); ~CelebADataset() = default; }; inline std::shared_ptr CelebA( const std::string &dataset_dir, const std::string &usage = "all", const std::shared_ptr &sampler = std::make_shared(), 
bool decode = false, const std::set &extensions = {}, const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_dir), StringToChar(usage), sampler, decode, SetStringToChar(extensions), cache); } inline std::shared_ptr CelebA(const std::string &dataset_dir, const std::string &usage, const Sampler *sampler, bool decode = false, const std::set &extensions = {}, const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_dir), StringToChar(usage), sampler, decode, SetStringToChar(extensions), cache); } inline std::shared_ptr CelebA(const std::string &dataset_dir, const std::string &usage, const std::reference_wrapper sampler, bool decode = false, const std::set &extensions = {}, const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_dir), StringToChar(usage), sampler, decode, SetStringToChar(extensions), cache); } class Cifar10Dataset : public Dataset { public: explicit Cifar10Dataset(const std::vector &dataset_dir, const std::vector &usage, const std::shared_ptr &sampler, const std::shared_ptr &cache); explicit Cifar10Dataset(const std::vector &dataset_dir, const std::vector &usage, const Sampler *sampler, const std::shared_ptr &cache); explicit Cifar10Dataset(const std::vector &dataset_dir, const std::vector &usage, const std::reference_wrapper sampler, const std::shared_ptr &cache); ~Cifar10Dataset() = default; }; inline std::shared_ptr Cifar10( const std::string &dataset_dir, const std::string &usage = "all", const std::shared_ptr &sampler = std::make_shared(), const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_dir), StringToChar(usage), sampler, cache); } inline std::shared_ptr Cifar10(const std::string &dataset_dir, const std::string &usage, const Sampler *sampler, const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_dir), StringToChar(usage), sampler, cache); } inline std::shared_ptr Cifar10(const 
std::string &dataset_dir, const std::string &usage, const std::reference_wrapper sampler, const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_dir), StringToChar(usage), sampler, cache); } class Cifar100Dataset : public Dataset { public: explicit Cifar100Dataset(const std::vector &dataset_dir, const std::vector &usage, const std::shared_ptr &sampler, const std::shared_ptr &cache); explicit Cifar100Dataset(const std::vector &dataset_dir, const std::vector &usage, const Sampler *sampler, const std::shared_ptr &cache); explicit Cifar100Dataset(const std::vector &dataset_dir, const std::vector &usage, const std::reference_wrapper sampler, const std::shared_ptr &cache); ~Cifar100Dataset() = default; }; inline std::shared_ptr Cifar100( const std::string &dataset_dir, const std::string &usage = "all", const std::shared_ptr &sampler = std::make_shared(), const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_dir), StringToChar(usage), sampler, cache); } inline std::shared_ptr Cifar100(const std::string &dataset_dir, const std::string &usage, const Sampler *sampler, const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_dir), StringToChar(usage), sampler, cache); } inline std::shared_ptr Cifar100(const std::string &dataset_dir, const std::string &usage, const std::reference_wrapper sampler, const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_dir), StringToChar(usage), sampler, cache); } class CLUEDataset : public Dataset { public: explicit CLUEDataset(const std::vector> &dataset_files, const std::vector &task, const std::vector &usage, int64_t num_samples, ShuffleMode shuffle, int32_t num_shards, int32_t shard_id, const std::shared_ptr &cache); ~CLUEDataset() = default; }; inline std::shared_ptr CLUE(const std::vector &dataset_files, const std::string &task = "AFQMC", const std::string &usage = "train", int64_t num_samples = 0, ShuffleMode 
shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1, int32_t shard_id = 0, const std::shared_ptr &cache = nullptr) { return std::make_shared(VectorStringToChar(dataset_files), StringToChar(task), StringToChar(usage), num_samples, shuffle, num_shards, shard_id, cache); } class CocoDataset : public Dataset { public: CocoDataset(const std::vector &dataset_dir, const std::vector &annotation_file, const std::vector &task, const bool &decode, const std::shared_ptr &sampler, const std::shared_ptr &cache, const bool &extra_metadata); CocoDataset(const std::vector &dataset_dir, const std::vector &annotation_file, const std::vector &task, const bool &decode, const Sampler *sampler, const std::shared_ptr &cache, const bool &extra_metadata); CocoDataset(const std::vector &dataset_dir, const std::vector &annotation_file, const std::vector &task, const bool &decode, const std::reference_wrapper sampler, const std::shared_ptr &cache, const bool &extra_metadata); ~CocoDataset() = default; }; inline std::shared_ptr Coco(const std::string &dataset_dir, const std::string &annotation_file, const std::string &task = "Detection", const bool &decode = false, const std::shared_ptr &sampler = std::make_shared(), const std::shared_ptr &cache = nullptr, const bool &extra_metadata = false) { return std::make_shared(StringToChar(dataset_dir), StringToChar(annotation_file), StringToChar(task), decode, sampler, cache, extra_metadata); } inline std::shared_ptr Coco(const std::string &dataset_dir, const std::string &annotation_file, const std::string &task, const bool &decode, const Sampler *sampler, const std::shared_ptr &cache = nullptr, const bool &extra_metadata = false) { return std::make_shared(StringToChar(dataset_dir), StringToChar(annotation_file), StringToChar(task), decode, sampler, cache, extra_metadata); } inline std::shared_ptr Coco(const std::string &dataset_dir, const std::string &annotation_file, const std::string &task, const bool &decode, const std::reference_wrapper sampler, 
const std::shared_ptr &cache = nullptr, const bool &extra_metadata = false) { return std::make_shared(StringToChar(dataset_dir), StringToChar(annotation_file), StringToChar(task), decode, sampler, cache, extra_metadata); } class CSVDataset : public Dataset { public: explicit CSVDataset(const std::vector> &dataset_files, char field_delim, const std::vector> &column_defaults, const std::vector> &column_names, int64_t num_samples, ShuffleMode shuffle, int32_t num_shards, int32_t shard_id, const std::shared_ptr &cache); ~CSVDataset() = default; }; inline std::shared_ptr CSV(const std::vector &dataset_files, char field_delim = ',', const std::vector> &column_defaults = {}, const std::vector &column_names = {}, int64_t num_samples = 0, ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1, int32_t shard_id = 0, const std::shared_ptr &cache = nullptr) { return std::make_shared(VectorStringToChar(dataset_files), field_delim, column_defaults, VectorStringToChar(column_names), num_samples, shuffle, num_shards, shard_id, cache); } class ImageFolderDataset : public Dataset { public: explicit ImageFolderDataset(const std::vector &dataset_dir, bool decode, const std::shared_ptr &sampler, const std::set> &extensions, const std::map, int32_t> &class_indexing, const std::shared_ptr &cache); explicit ImageFolderDataset(const std::vector &dataset_dir, bool decode, const Sampler *sampler, const std::set> &extensions, const std::map, int32_t> &class_indexing, const std::shared_ptr &cache); explicit ImageFolderDataset(const std::vector &dataset_dir, bool decode, const std::reference_wrapper sampler, const std::set> &extensions, const std::map, int32_t> &class_indexing, const std::shared_ptr &cache); ~ImageFolderDataset() = default; }; inline std::shared_ptr ImageFolder( const std::string &dataset_dir, bool decode = false, const std::shared_ptr &sampler = std::make_shared(), const std::set &extensions = {}, const std::map &class_indexing = {}, const std::shared_ptr &cache = 
nullptr) { return std::make_shared(StringToChar(dataset_dir), decode, sampler, SetStringToChar(extensions), MapStringToChar(class_indexing), cache); } inline std::shared_ptr ImageFolder(const std::string &dataset_dir, bool decode, const Sampler *sampler, const std::set &extensions = {}, const std::map &class_indexing = {}, const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_dir), decode, sampler, SetStringToChar(extensions), MapStringToChar(class_indexing), cache); } inline std::shared_ptr ImageFolder(const std::string &dataset_dir, bool decode, const std::reference_wrapper sampler, const std::set &extensions = {}, const std::map &class_indexing = {}, const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_dir), decode, sampler, SetStringToChar(extensions), MapStringToChar(class_indexing), cache); } class ManifestDataset : public Dataset { public: explicit ManifestDataset(const std::vector &dataset_file, const std::vector &usage, const std::shared_ptr &sampler, const std::map, int32_t> &class_indexing, bool decode, const std::shared_ptr &cache); explicit ManifestDataset(const std::vector &dataset_file, const std::vector &usage, const Sampler *sampler, const std::map, int32_t> &class_indexing, bool decode, const std::shared_ptr &cache); explicit ManifestDataset(const std::vector &dataset_file, const std::vector &usage, const std::reference_wrapper sampler, const std::map, int32_t> &class_indexing, bool decode, const std::shared_ptr &cache); ~ManifestDataset() = default; }; inline std::shared_ptr Manifest( const std::string &dataset_file, const std::string &usage = "train", const std::shared_ptr &sampler = std::make_shared(), const std::map &class_indexing = {}, bool decode = false, const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_file), StringToChar(usage), sampler, MapStringToChar(class_indexing), decode, cache); } inline std::shared_ptr Manifest(const 
std::string &dataset_file, const std::string &usage, const Sampler *sampler, const std::map &class_indexing = {}, bool decode = false, const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_file), StringToChar(usage), sampler, MapStringToChar(class_indexing), decode, cache); } inline std::shared_ptr Manifest(const std::string &dataset_file, const std::string &usage, const std::reference_wrapper sampler, const std::map &class_indexing = {}, bool decode = false, const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_file), StringToChar(usage), sampler, MapStringToChar(class_indexing), decode, cache); } class MindDataDataset : public Dataset { public: explicit MindDataDataset(const std::vector &dataset_file, const std::vector> &columns_list, const std::shared_ptr &sampler, const nlohmann::json *padded_sample, int64_t num_padded, ShuffleMode shuffle_mode = ShuffleMode::kGlobal, const std::shared_ptr &cache = nullptr); explicit MindDataDataset(const std::vector &dataset_file, const std::vector> &columns_list, const Sampler *sampler, const nlohmann::json *padded_sample, int64_t num_padded, ShuffleMode shuffle_mode = ShuffleMode::kGlobal, const std::shared_ptr &cache = nullptr); explicit MindDataDataset(const std::vector &dataset_file, const std::vector> &columns_list, const std::reference_wrapper sampler, const nlohmann::json *padded_sample, int64_t num_padded, ShuffleMode shuffle_mode = ShuffleMode::kGlobal, const std::shared_ptr &cache = nullptr); explicit MindDataDataset(const std::vector> &dataset_files, const std::vector> &columns_list, const std::shared_ptr &sampler, const nlohmann::json *padded_sample, int64_t num_padded, ShuffleMode shuffle_mode = ShuffleMode::kGlobal, const std::shared_ptr &cache = nullptr); explicit MindDataDataset(const std::vector> &dataset_files, const std::vector> &columns_list, const Sampler *sampler, const nlohmann::json *padded_sample, int64_t num_padded, ShuffleMode 
shuffle_mode = ShuffleMode::kGlobal, const std::shared_ptr &cache = nullptr); explicit MindDataDataset(const std::vector> &dataset_files, const std::vector> &columns_list, const std::reference_wrapper sampler, const nlohmann::json *padded_sample, int64_t num_padded, ShuffleMode shuffle_mode = ShuffleMode::kGlobal, const std::shared_ptr &cache = nullptr); ~MindDataDataset() = default; }; inline std::shared_ptr MindData( const std::string &dataset_file, const std::vector &columns_list = {}, const std::shared_ptr &sampler = std::make_shared(), nlohmann::json *padded_sample = nullptr, int64_t num_padded = 0, ShuffleMode shuffle_mode = ShuffleMode::kGlobal, const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_file), VectorStringToChar(columns_list), sampler, padded_sample, num_padded, shuffle_mode, cache); } inline std::shared_ptr MindData(const std::string &dataset_file, const std::vector &columns_list, const Sampler *sampler, nlohmann::json *padded_sample = nullptr, int64_t num_padded = 0, ShuffleMode shuffle_mode = ShuffleMode::kGlobal, const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_file), VectorStringToChar(columns_list), sampler, padded_sample, num_padded, shuffle_mode, cache); } inline std::shared_ptr MindData(const std::string &dataset_file, const std::vector &columns_list, const std::reference_wrapper sampler, nlohmann::json *padded_sample = nullptr, int64_t num_padded = 0, ShuffleMode shuffle_mode = ShuffleMode::kGlobal, const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_file), VectorStringToChar(columns_list), sampler, padded_sample, num_padded, shuffle_mode, cache); } inline std::shared_ptr MindData( const std::vector &dataset_files, const std::vector &columns_list = {}, const std::shared_ptr &sampler = std::make_shared(), nlohmann::json *padded_sample = nullptr, int64_t num_padded = 0, ShuffleMode shuffle_mode = ShuffleMode::kGlobal, const 
std::shared_ptr &cache = nullptr) { return std::make_shared(VectorStringToChar(dataset_files), VectorStringToChar(columns_list), sampler, padded_sample, num_padded, shuffle_mode, cache); } inline std::shared_ptr MindData(const std::vector &dataset_files, const std::vector &columns_list, const Sampler *sampler, nlohmann::json *padded_sample = nullptr, int64_t num_padded = 0, ShuffleMode shuffle_mode = ShuffleMode::kGlobal, const std::shared_ptr &cache = nullptr) { return std::make_shared(VectorStringToChar(dataset_files), VectorStringToChar(columns_list), sampler, padded_sample, num_padded, shuffle_mode, cache); } inline std::shared_ptr MindData(const std::vector &dataset_files, const std::vector &columns_list, const std::reference_wrapper sampler, nlohmann::json *padded_sample = nullptr, int64_t num_padded = 0, ShuffleMode shuffle_mode = ShuffleMode::kGlobal, const std::shared_ptr &cache = nullptr) { return std::make_shared(VectorStringToChar(dataset_files), VectorStringToChar(columns_list), sampler, padded_sample, num_padded, shuffle_mode, cache); } class MnistDataset : public Dataset { public: explicit MnistDataset(const std::vector &dataset_dir, const std::vector &usage, const std::shared_ptr &sampler, const std::shared_ptr &cache); explicit MnistDataset(const std::vector &dataset_dir, const std::vector &usage, const Sampler *sampler, const std::shared_ptr &cache); explicit MnistDataset(const std::vector &dataset_dir, const std::vector &usage, const std::reference_wrapper sampler, const std::shared_ptr &cache); ~MnistDataset() = default; }; inline std::shared_ptr Mnist(const std::string &dataset_dir, const std::string &usage = "all", const std::shared_ptr &sampler = std::make_shared(), const std::shared_ptr &cache = nullptr) { return std::make_shared(StringToChar(dataset_dir), StringToChar(usage), sampler, cache); } inline std::shared_ptr Mnist(const std::string &dataset_dir, const std::string &usage, const Sampler *sampler, const std::shared_ptr &cache = 
nullptr) {
  // Closing part of a definition that begins above this chunk. It forwards the converted
  // dataset_dir/usage strings, together with sampler and cache, to std::make_shared.
  return std::make_shared(StringToChar(dataset_dir), StringToChar(usage), sampler, cache);
}

// NOTE(review): throughout this listing the template argument lists appear to have been
// stripped (see `template >`, `std::is_same::value`, `std::vector>` and `std::map, int32_t>`
// below), so the text is not well-formed C++ as shown. Restore them from the original header
// before compiling; the code tokens here are intentionally left exactly as found.

// Mnist factory overload that takes the sampler as a std::reference_wrapper.
// dataset_dir and usage are converted with StringToChar; sampler and cache are passed on
// unchanged. cache defaults to nullptr.
inline std::shared_ptr Mnist(const std::string &dataset_dir, const std::string &usage,
                             const std::reference_wrapper sampler, const std::shared_ptr &cache = nullptr) {
  return std::make_shared(StringToChar(dataset_dir), StringToChar(usage), sampler, cache);
}

// Combines two datasets with `+`: the two operands are placed in a std::vector, which is handed
// to std::make_shared.
// NOTE(review): the type constructed by make_shared is not visible in this listing — presumably
// the concatenation dataset forward-declared in the header; confirm.
inline std::shared_ptr operator+(const std::shared_ptr &datasets1, const std::shared_ptr &datasets2) {
  return std::make_shared(std::vector({datasets1, datasets2}));
}

// Dataset subclass with two constructors that differ in their second parameter:
// a shared-pointer `schema` or a `schema_path`.
class RandomDataDataset : public Dataset {
 public:
  // Overload receiving the schema as a std::shared_ptr (taken by value).
  RandomDataDataset(const int32_t &total_rows, std::shared_ptr schema, const std::vector> &columns_list,
                    std::shared_ptr cache);
  // Overload receiving `schema_path` instead of a schema object.
  RandomDataDataset(const int32_t &total_rows, const std::vector &schema_path,
                    const std::vector> &columns_list, std::shared_ptr cache);
  ~RandomDataDataset() = default;
};

// Factory for RandomDataDataset.
// Defaults: total_rows = 0, schema = nullptr, columns_list = {}, cache = nullptr.
// The type T of `schema` is examined at compile time with `if constexpr`:
//   - first branch: `schema` is copied into a std::shared_ptr and moved into the dataset;
//   - otherwise: `schema` is converted with StringToChar.
// columns_list is converted with VectorStringToChar in both branches.
template >
std::shared_ptr RandomData(const int32_t &total_rows = 0, const T &schema = nullptr,
                           const std::vector &columns_list = {}, const std::shared_ptr &cache = nullptr) {
  std::shared_ptr ds;
  if constexpr (std::is_same::value || std::is_same>::value) {
    // Local copy so the pointer can be moved into the constructor's by-value parameter.
    std::shared_ptr schema_obj = schema;
    ds = std::make_shared(total_rows, std::move(schema_obj), VectorStringToChar(columns_list), cache);
  } else {
    ds = std::make_shared(total_rows, StringToChar(schema), VectorStringToChar(columns_list), cache);
  }
  return ds;
}

// Dataset subclass declared with a single explicit constructor taking the file list,
// sample count, shuffle mode, sharding parameters and cache.
class TextFileDataset : public Dataset {
 public:
  explicit TextFileDataset(const std::vector> &dataset_files, int64_t num_samples, ShuffleMode shuffle,
                           int32_t num_shards, int32_t shard_id, const std::shared_ptr &cache);
  ~TextFileDataset() = default;
};

// Factory for text-file datasets.
// Defaults: num_samples = 0, shuffle = ShuffleMode::kGlobal, num_shards = 1, shard_id = 0,
// cache = nullptr. dataset_files is converted with VectorStringToChar; every other argument
// is forwarded as given.
inline std::shared_ptr TextFile(const std::vector &dataset_files, int64_t num_samples = 0,
                                ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1,
                                int32_t shard_id = 0, const std::shared_ptr &cache = nullptr) {
  return std::make_shared(VectorStringToChar(dataset_files), num_samples, shuffle, num_shards, shard_id, cache);
}

// Dataset subclass with two constructors that differ only in how the schema is supplied:
// as a `const std::vector &schema` or as a by-value std::shared_ptr.
class TFRecordDataset : public Dataset {
 public:
  // Overload receiving the schema as a const vector reference.
  TFRecordDataset(const std::vector> &dataset_files, const std::vector &schema,
                  const std::vector> &columns_list, int64_t num_samples, ShuffleMode shuffle,
                  int32_t num_shards, int32_t shard_id, bool shard_equal_rows, std::shared_ptr cache);
  // Overload receiving the schema as a std::shared_ptr.
  TFRecordDataset(const std::vector> &dataset_files, std::shared_ptr schema,
                  const std::vector> &columns_list, int64_t num_samples, ShuffleMode shuffle,
                  int32_t num_shards, int32_t shard_id, bool shard_equal_rows, std::shared_ptr cache);
  ~TFRecordDataset() = default;
};

// Factory for TFRecord datasets.
// Defaults: schema = nullptr, columns_list = {}, num_samples = 0, shuffle = ShuffleMode::kGlobal,
// num_shards = 1, shard_id = 0, shard_equal_rows = false, cache = nullptr.
// The type T of `schema` is examined at compile time with `if constexpr`:
//   - first branch: `schema` is copied into a std::shared_ptr and moved into the dataset;
//   - otherwise: `schema` is copied into a std::string path. A non-empty path is probed with
//     stat(), and the function returns nullptr when stat() reports failure.
// Returns the created dataset, or nullptr in the failure case above.
template >
std::shared_ptr TFRecord(const std::vector &dataset_files, const T &schema = nullptr,
                         const std::vector &columns_list = {}, int64_t num_samples = 0,
                         ShuffleMode shuffle = ShuffleMode::kGlobal, int32_t num_shards = 1,
                         int32_t shard_id = 0, bool shard_equal_rows = false,
                         const std::shared_ptr &cache = nullptr) {
  std::shared_ptr ds = nullptr;
  if constexpr (std::is_same::value || std::is_same>::value) {
    // Local copy so the pointer can be moved into the constructor's by-value parameter.
    std::shared_ptr schema_obj = schema;
    ds = std::make_shared(VectorStringToChar(dataset_files), std::move(schema_obj),
                          VectorStringToChar(columns_list), num_samples, shuffle, num_shards, shard_id,
                          shard_equal_rows, cache);
  } else {
    std::string schema_path = schema;
    if (!schema_path.empty()) {
      // stat() serves only as a probe here: sb is never read, only the return code matters.
      struct stat sb;
      int rc = stat(schema_path.c_str(), &sb);
      if (rc != 0) {
        // The path could not be stat'ed; give up without constructing a dataset.
        return nullptr;
      }
    }
    ds = std::make_shared(VectorStringToChar(dataset_files), StringToChar(schema_path),
                          VectorStringToChar(columns_list), num_samples, shuffle, num_shards, shard_id,
                          shard_equal_rows, cache);
  }
  return ds;
}

// Dataset subclass with three explicit constructors. They share every parameter except
// `sampler`, which is accepted as a std::shared_ptr reference, a `const Sampler *`, or a
// std::reference_wrapper.
class VOCDataset : public Dataset {
 public:
  // Sampler supplied as a const reference to a std::shared_ptr.
  explicit VOCDataset(const std::vector &dataset_dir, const std::vector &task, const std::vector &usage,
                      const std::map, int32_t> &class_indexing, bool decode,
                      const std::shared_ptr &sampler, const std::shared_ptr &cache, bool extra_metadata);
  // Sampler supplied as a raw pointer to const Sampler.
  explicit VOCDataset(const std::vector &dataset_dir, const std::vector &task, const std::vector &usage,
                      const std::map, int32_t> &class_indexing, bool decode,
                      const Sampler *sampler, const std::shared_ptr &cache, bool extra_metadata);
  // Sampler supplied as a std::reference_wrapper.
  explicit VOCDataset(const std::vector &dataset_dir, const std::vector &task, const std::vector &usage,
                      const std::map, int32_t> &class_indexing, bool decode,
                      const std::reference_wrapper sampler, const std::shared_ptr &cache,
                      bool extra_metadata);
  ~VOCDataset() = default;
};

// VOC factory overload with a shared-pointer sampler; the only overload in which every
// parameter after dataset_dir has a default.
// Defaults: task = "Segmentation", usage = "train", class_indexing = {}, decode = false,
// sampler = std::make_shared(), cache = nullptr, extra_metadata = false.
// NOTE(review): the concrete sampler type created by the default std::make_shared() is not
// visible in this listing; confirm against the original header.
// String arguments are converted with StringToChar and class_indexing with MapStringToChar.
inline std::shared_ptr VOC(const std::string &dataset_dir, const std::string &task = "Segmentation",
                           const std::string &usage = "train", const std::map &class_indexing = {},
                           bool decode = false, const std::shared_ptr &sampler = std::make_shared(),
                           const std::shared_ptr &cache = nullptr, bool extra_metadata = false) {
  return std::make_shared(StringToChar(dataset_dir), StringToChar(task), StringToChar(usage),
                          MapStringToChar(class_indexing), decode, sampler, cache, extra_metadata);
}

// VOC factory overload taking the sampler as `const Sampler *`.
// Only cache (nullptr) and extra_metadata (false) have defaults; the conversions match the
// overload above.
inline std::shared_ptr VOC(const std::string &dataset_dir, const std::string &task, const std::string &usage,
                           const std::map &class_indexing, bool decode, const Sampler *sampler,
                           const std::shared_ptr &cache = nullptr, bool extra_metadata = false) {
  return std::make_shared(StringToChar(dataset_dir), StringToChar(task), StringToChar(usage),
                          MapStringToChar(class_indexing), decode, sampler, cache, extra_metadata);
}

// VOC factory overload taking the sampler as a std::reference_wrapper.
// Only cache (nullptr) and extra_metadata (false) have defaults; the conversions match the
// overloads above.
inline std::shared_ptr VOC(const std::string &dataset_dir, const std::string &task, const std::string &usage,
                           const std::map &class_indexing, bool decode,
                           const std::reference_wrapper sampler, const std::shared_ptr &cache = nullptr,
                           bool extra_metadata = false) {
  return std::make_shared(StringToChar(dataset_dir), StringToChar(task), StringToChar(usage),
                          MapStringToChar(class_indexing), decode, sampler, cache, extra_metadata);
}

// Declaration only (no body in this header listing). CreateDatasetCache below forwards to it.
// hostname, port, num_connections and prefetch_sz are std::optional parameters that default
// to std::nullopt.
std::shared_ptr CreateDatasetCacheCharIF(session_id_type id, uint64_t mem_sz, bool spill,
                                         std::optional> hostname = std::nullopt,
                                         std::optional port = std::nullopt,
                                         std::optional num_connections = std::nullopt,
                                         std::optional prefetch_sz = std::nullopt);

// Wrapper around CreateDatasetCacheCharIF.
// When hostname holds a value, its characters (begin()..end()) are copied into a std::vector
// stored in hostname_c; otherwise hostname_c stays std::nullopt. All other arguments are
// forwarded unchanged, and the callee's result is returned.
inline std::shared_ptr CreateDatasetCache(session_id_type id, uint64_t mem_sz, bool spill,
                                          std::optional hostname = std::nullopt,
                                          std::optional port = std::nullopt,
                                          std::optional num_connections = std::nullopt,
                                          std::optional prefetch_sz = std::nullopt) {
  std::optional> hostname_c = std::nullopt;
  if (hostname != std::nullopt) {
    hostname_c = std::vector(hostname->begin(), hostname->end());
  }
  return CreateDatasetCacheCharIF(id, mem_sz, spill, hostname_c, port, num_connections, prefetch_sz);
}

// Zip factory: passes the given vector of datasets straight to std::make_shared.
inline std::shared_ptr Zip(const std::vector> &datasets) {
  return std::make_shared(datasets);
}

// NOTE(review): the closing line below is kept exactly as found. In this collapsed listing the
// brace closing namespace mindspore and the #endif of the include guard sit inside the trailing
// `//` comment; in real source each needs its own line.
} // namespace dataset } // namespace mindspore #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_DATASETS_H_