1. BERT PB model export

# encoding:utf-8
import os
import shutil

import tensorflow as tf

from model.create_model import BertClassifier
from model.model_config import BertConfig


def export_model(checkpoint_dir, epoch, model_version, vocab_path, bert_config_path, label_type_path):
    export_path_base = os.path.join(checkpoint_dir, "serving_output")
    if os.path.exists(export_path_base):
        shutil.rmtree(export_path_base)

    with tf.compat.v1.get_default_graph().as_default():
        inputs = {}
        inputs["input_word_ids"] = tf.keras.Input(shape=(128), dtype=tf.int32)
        inputs["input_mask"] = tf.keras.Input(shape=(128), dtype=tf.int32)
        inputs["input_type_ids"] = tf.keras.Input(shape=(128), dtype=tf.int32)

        bert_config = BertConfig.from_json_file_v2(bert_config_path, vocab_path)
        labels = [c[1:].strip() for c in open(label_type_path).readlines()]

        bert_classifier = BertClassifier(bert_config, 128, len(labels), output="predictions")

        if epoch < 10:
            latest_ckpt = "%s/checkpoint-0%d" % (checkpoint_dir, epoch)
        else:
            latest_ckpt = "%s/checkpoint-%d" % (checkpoint_dir, epoch)
        print(latest_ckpt)

        bert_classifier.load_weights(latest_ckpt)
        bert_classifier._set_inputs(inputs)

        print('export dir:', export_path_base)
        config_dir = os.path.join(checkpoint_dir, 'configs')
        if not os.path.exists(config_dir):
            os.mkdir(config_dir)
        print('config dir :', config_dir)

        shutil.copy(bert_config_path, os.path.join(config_dir, os.path.basename(bert_config_path)))
        shutil.copy(vocab_path, os.path.join(config_dir, os.path.basename(vocab_path)))

        export_path = os.path.join(tf.compat.as_bytes(export_path_base), tf.compat.as_bytes(str(model_version)))
        print('export model path :', export_path)
        bert_classifier.save(export_path, save_format='tf')


if __name__ == '__main__':
    os.environ['CUDA_VISIBLE_DEVICES'] = "0"

    # v4.3
    checkpoint_path = '... '
    vocab_path = '... txt'
    bert_config_path = "... json"
    epoch = 4
    model_version = 5
    label_type_path = '... '
    export_model(checkpoint_path, epoch, model_version, vocab_path, bert_config_path, label_type_path)

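A quick sanity check I would add (my sketch, not part of the original snippet): reload the exported SavedModel and print its serving signature, assuming Keras wrote the usual serving_default signature. The path is just the <serving_output>/<version> pattern produced above.

import tensorflow as tf

# Sketch: reload the export and inspect the default serving signature.
# "serving_output/5" stands for <checkpoint_dir>/serving_output/<model_version>.
loaded = tf.saved_model.load("serving_output/5")
infer = loaded.signatures["serving_default"]
print(infer.structured_input_signature)   # expect input_word_ids / input_mask / input_type_ids, shape (None, 128)
print(infer.structured_outputs)           # the "predictions" output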

2. Go invokes TF-GPU

Approach 1: verified not to work, very painful

model, err := tf.LoadSavedModel(modelPath, []string{"serve"}, SessionOptions)

If the SessionOptions argument is passed as nil, TensorFlow grabs (nearly) all of the GPU memory by default. To limit GPU memory usage, this parameter has to be set.

SessionOptions is a bit trickier to build: its Config field has to be filled with binary-serialized protobuf data (a tensorflow.ConfigProto).

type SessionOptions struct {
	// Target indicates the TensorFlow runtime to connect to.
	//
	// If 'target' is empty or unspecified, the local TensorFlow runtime
	// implementation will be used. Otherwise, the TensorFlow engine
	// defined by 'target' will be used to perform all computations.
	//
	// "target" can be either a single entry or a comma separated list
	// of entries. Each entry is a resolvable address of one of the
	// following formats:
	// local
	// ip:port
	// host:port
	// ... other system-specific formats to identify tasks and jobs ...
	//
	// NOTE: at the moment 'local' maps to an in-process service-based
	// runtime.
	//
	// Upon creation, a single session affines itself to one of the
	// remote processes, with possible load balancing choices when the
	// "target" resolves to a list of possible processes.
	//
	// If the session disconnects from the remote process during its
	// lifetime, session calls may fail immediately.
	Target string

	// Config is a binary-serialized representation of the
	// tensorflow.ConfigProto protocol message
	// (https://www.tensorflow.org/code/tensorflow/core/protobuf/config.proto).
	Config []byte
}
message GPUOptions {
  // Fraction of the available GPU memory to allocate for each process.
  // 1 means to allocate all of the GPU memory, 0.5 means the process
  // allocates up to ~50% of the available GPU memory.
  //
  // GPU memory is pre-allocated unless the allow_growth option is enabled.
  //
  // If greater than 1.0, uses CUDA unified memory to potentially oversubscribe
  // the amount of memory available on the GPU device by using host memory as a
  // swap space. Accessing memory not available on the device will be
  // significantly slower as that would require memory transfer between the host
  // and the device. Options to reduce the memory requirement should be
  // considered before enabling this option as this may come with a negative
  // performance impact. Oversubscription using the unified memory requires
  // Pascal class or newer GPUs and it is currently only supported on the Linux
  // operating system. See
  // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements
  // for the detailed requirements.
  double per_process_gpu_memory_fraction = 1;

  // If true, the allocator does not pre-allocate the entire specified
  // GPU memory region, instead starting small and growing as needed.
  bool allow_growth = 4;

  // The type of GPU allocation strategy to use.
  //
  // Allowed values:
  // "": The empty string (default) uses a system-chosen default
  // which may change over time.
  //
  // "BFC": A "Best-fit with coalescing" algorithm, simplified from a
  // version of dlmalloc.
  string allocator_type = 2;

  // Delay deletion of up to this many bytes to reduce the number of
  // interactions with gpu driver code. If 0, the system chooses
  // a reasonable default (several MBs).
  int64 deferred_deletion_bytes = 3;

  // A comma-separated list of GPU ids that determines the 'visible'
  // to 'virtual' mapping of GPU devices. For example, if TensorFlow
  // can see 8 GPU devices in the process, and one wanted to map
  // visible GPU devices 5 and 3 as "/device:GPU:0", and "/device:GPU:1",
  // then you would specify this field as "5,3". This field is similar in
  // spirit to the CUDA_VISIBLE_DEVICES environment variable, except
  // it applies to the visible GPU devices in the process.
  //
  // NOTE:
  // 1. The GPU driver provides the process with the visible GPUs
  // in an order which is not guaranteed to have any correlation to
  // the *physical* GPU id in the machine. This field is used for
  // remapping "visible" to "virtual", which means this operates only
  // after the process starts. Users are required to use vendor
  // specific mechanisms (e.g., CUDA_VISIBLE_DEVICES) to control the
  // physical to visible device mapping prior to invoking TensorFlow.
  // 2. In the code, the ids in this list are also called "platform GPU id"s,
  // and the 'virtual' ids of GPU devices (i.e. the ids in the device
  // name "/device:GPU:<id>") are also called "TF GPU id"s. Please
  // refer to third_party/tensorflow/core/common_runtime/gpu/gpu_id.h
  // for more information.
  string visible_device_list = 5;

  // In the event polling loop sleep this many microseconds between
  // PollEvents calls, when the queue is not empty. If value is not
  // set or set to 0, gets set to a non-zero default.
  int32 polling_active_delay_usecs = 6;

  // This field is deprecated and ignored.
  int32 polling_inactive_delay_msecs = 7;

  // Force all tensors to be gpu_compatible. On a GPU-enabled TensorFlow,
  // enabling this option forces all CPU tensors to be allocated with Cuda
  // pinned memory. Normally, TensorFlow will infer which tensors should be
  // allocated as the pinned memory. But in case where the inference is
  // incomplete, this option can significantly speed up the cross-device memory
  // copy performance as long as it fits the memory.
  // Note that this option is not something that should be
  // enabled by default for unknown or very large models, since all Cuda pinned
  // memory is unpageable, having too much pinned memory might negatively impact
  // the overall host system performance.
  bool force_gpu_compatible = 8;

  message Experimental {
    // Configuration for breaking down a visible GPU into multiple "virtual"
    // devices.
    message VirtualDevices {
      // Per "virtual" device memory limit, in MB. The number of elements in
      // the list is the number of virtual devices to create on the
      // corresponding visible GPU (see "virtual_devices" below).
      // If empty, it will create single virtual device taking all available
      // memory from the device.
      //
      // For the concept of "visible" and "virtual" GPU, see the comments for
      // "visible_device_list" above for more information.
      repeated float memory_limit_mb = 1;

      // Priority values to use with the virtual devices. Use the cuda function
      // cudaDeviceGetStreamPriorityRange to query for valid range of values for
      // priority.
      //
      // On a P4000 GPU with cuda 10.1, the priority range reported was 0 for
      // least priority and -1 for greatest priority.
      //
      // If this field is not specified, then the virtual devices will be
      // created with the default. If this field has values set, then the size
      // of this must match with the above memory_limit_mb.
      repeated int32 priority = 2;
    }

    // The multi virtual device settings. If empty (not set), it will create
    // single virtual device on each visible GPU, according to the settings
    // in "visible_device_list" above. Otherwise, the number of elements in the
    // list must be the same as the number of visible GPUs (after
    // "visible_device_list" filtering if it is set), and the string represented
    // device names (e.g. /device:GPU:<id>) will refer to the virtual
    // devices and have the <id> field assigned sequentially starting from 0,
    // according to the order they appear in this list and the "memory_limit"
    // list inside each element. For example,
    //   visible_device_list = "1,0"
    // virtual_devices { memory_limit: 1GB memory_limit: 2GB }
    // virtual_devices {}
    // will create three virtual devices as:
    // /device:GPU:0 -> visible GPU 1 with 1GB memory
    // /device:GPU:1 -> visible GPU 1 with 2GB memory
    // /device:GPU:2 -> visible GPU 0 with all available memory
    //
    // NOTE:
    // 1. It's invalid to set both this and "per_process_gpu_memory_fraction"
    // at the same time.
    // 2. Currently this setting is per-process, not per-session. Using
    // different settings in different sessions within same process will
    // result in undefined behavior.
    repeated VirtualDevices virtual_devices = 1;

    // If true, uses CUDA unified memory for memory allocations. If
    // per_process_gpu_memory_fraction option is greater than 1.0, then unified
    // memory is used regardless of the value for this field. See comments for
    // per_process_gpu_memory_fraction field for more details and requirements
    // of the unified memory. This option is useful to oversubscribe memory if
    // multiple processes are sharing a single GPU while individually using less
    // than 1.0 per process memory fraction.
    bool use_unified_memory = 2;

    // If > 1, the number of device-to-device copy streams to create
    // for each GPUDevice. Default value is 0, which is automatically
    // converted to 1.
    int32 num_dev_to_dev_copy_streams = 3;

    // If non-empty, defines a good GPU ring order on a single worker based on
    // device interconnect. This assumes that all workers have the same GPU
    // topology. Specify as a comma-separated string, e.g. "3,2,1,0,7,6,5,4".
    // This ring order is used by the RingReducer implementation of
    // CollectiveReduce, and serves as an override to automatic ring order
    // generation in OrderTaskDeviceMap() during CollectiveParam resolution.
    string collective_ring_order = 4;

    // If true then extra work is done by GPUDevice and GPUBFCAllocator to
    // keep track of when GPU memory is freed and when kernels actually
    // complete so that we can know when a nominally free memory chunk
    // is really not subject to pending use.
    bool timestamped_allocator = 5;

    // reserved id: 6

    // Parameters for GPUKernelTracker. By default no kernel tracking is done.
    // Note that timestamped_allocator is only effective if some tracking is
    // specified.
    //
    // If kernel_tracker_max_interval = n > 0, then a tracking event
    // is inserted after every n kernels without an event.
    int32 kernel_tracker_max_interval = 7;
    // If kernel_tracker_max_bytes = n > 0, then a tracking event is
    // inserted after every series of kernels allocating a sum of
    // memory >= n. If one kernel allocates b * n bytes, then one
    // event will be inserted after it, but it will count as b against
    // the pending limit.
    int32 kernel_tracker_max_bytes = 8;
    // If kernel_tracker_max_pending > 0 then no more than this many
    // tracking events can be outstanding at a time. An attempt to
    // launch an additional kernel will stall until an event
    // completes.
    int32 kernel_tracker_max_pending = 9;

    // BFC Allocator can return an allocated chunk of memory upto 2x the
    // requested size. For virtual devices with tight memory constraints, and
    // proportionately large allocation requests, this can lead to a significant
    // reduction in available memory. The threshold below controls when a chunk
    // should be split if the chunk size exceeds requested memory size. It is
    // expressed as a fraction of total available memory for the tf device. For
    // example setting it to 0.05 would imply a chunk needs to be split if its
    // size exceeds the requested memory by 5% of the total virtual device/gpu
    // memory size.
    double internal_fragmentation_fraction = 10;

    // When true, use CUDA cudaMallocAsync API instead of TF gpu allocator.
    bool use_cuda_malloc_async = 11;
  }

  // Everything inside experimental is subject to change and is not subject
  // to API stability guarantees in
  // https://www.tensorflow.org/guide/version_compat.
  Experimental experimental = 9;
}
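Rather than hand-writing the protobuf bytes, one workaround (my sketch, not from the original post) is to serialize a tensorflow.ConfigProto in Python and paste the resulting bytes into the Go SessionOptions.Config field:

import tensorflow as tf

# Build a ConfigProto with the GPU options described above, then dump the
# serialized bytes; copy them into a Go []byte literal for SessionOptions.Config.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.3   # example value
print(list(config.SerializeToString()))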

Approach 2: set it the Python way

This can be set directly in Python:

os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true"

So the idea is to start the container with this environment variable set and see whether it works:

docker run -e TF_FORCE_GPU_ALLOW_GROWTH=true

Or bake it into the image, e.g. with ENV TF_FORCE_GPU_ALLOW_GROWTH=true in the Dockerfile (a plain export in a RUN line would not persist).
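For completeness (my addition, not from the post): the same effect can be requested in code via TensorFlow's memory-growth API, as long as it runs before the GPUs are first used:

import tensorflow as tf

# Equivalent in spirit to TF_FORCE_GPU_ALLOW_GROWTH=true, set from code.
for gpu in tf.config.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(gpu, True)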

3. Tips for downloading models

    # fragment from a transformers training script; model_args and config are
    # defined earlier, and TFAutoModelForSequenceClassification comes from the
    # transformers package
    model = TFAutoModelForSequenceClassification.from_pretrained(
        model_path,
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

The model weights are downloaded automatically and cached under model_args.cache_dir, but the drawback is that the cached files get meaningless (hash-like) names, which makes them hard to read.

Use manual download instead:

git lfs install
git clone https://huggingface.co/hfl/chinese-roberta-wwm-ext-large

Then delete the .git folder inside the cloned directory and point model_name_or_path at the local path:

  "model_name_or_path": "./cache/chinese-roberta-wwm-ext-large",

Much cleaner.
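A minimal loading sketch for the local clone (my example; num_labels is a placeholder, and the clone needs TF weights, otherwise add from_pt=True):

from transformers import AutoConfig, AutoTokenizer, TFAutoModelForSequenceClassification

local_path = "./cache/chinese-roberta-wwm-ext-large"   # the git-cloned directory
config = AutoConfig.from_pretrained(local_path, num_labels=2)   # placeholder label count
tokenizer = AutoTokenizer.from_pretrained(local_path)
model = TFAutoModelForSequenceClassification.from_pretrained(local_path, config=config)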

4. Training text classification with chinese-roberta-wwm-ext-large

The original plan was to train a large, high-precision model and use it to pick out suspicious dirty data. In practice, the results after training for 10 epochs are very poor; still trying to find the reason…

5. View the input and output of the exported PB model

Method 1: a more cumbersome approach found online

Step 1: find the path of saved_model_cli:

from tensorflow.python.tools import saved_model_cli
print(saved_model_cli.__file__)
# output: /XXXX/XXX/local/lib/python3.6/site-packages/tensorflow/python/tools/saved_model_cli.py

Step 2: run the following command line:

cd /XXXX/XXX/local/lib/python3.6/site-packages/tensorflow/python/tools
python saved_model_cli.py show --dir <directory containing the pb model> --all

The output lists the model's signatures, including the input and output tensor names, dtypes and shapes.

Method 2: do it from code

Reading the source shows which attributes show() expects; fill them in accordingly:

import argparse

from tensorflow.python.tools import saved_model_cli


def print_model_input_output(model_dir):
    # with all=True, saved_model_cli.show() only reads the dir attribute besides all
    args = argparse.Namespace(all=True, dir=model_dir)
    saved_model_cli.show(args)


if __name__ == '__main__':
    model_dir = '/data_nvme/DXQ/text_filter_dl/bert_tf2/output_model/v4.3/serving_output0708/1'
    print_model_input_output(model_dir)


6. Specify the model input name

    with tf.compat.v1.get_default_graph().as_default():
        inputs = {}
        inputs["input_word_ids"] = tf.keras.Input(shape=(128,), dtype=tf.int32, name='input_word_ids')
        inputs["input_mask"] = tf.keras.Input(shape=(128,), dtype=tf.int32, name='input_mask')
        inputs["input_type_ids"] = tf.keras.Input(shape=(128,), dtype=tf.int32, name='input_type_ids')

        bert_config = BertConfig.from_json_file_v2(bert_config_path, vocab_path)
        labels = [c[1:].strip() for c in open(label_type_path).readlines()]

        bert_classifier = BertClassifier(bert_config, 128, len(labels), output="predictions")

        if epoch < 10:
            latest_ckpt = "%s/checkpoint-0%d" % (checkpoint_dir, epoch)
        else:
            latest_ckpt = "%s/checkpoint-%d" % (checkpoint_dir, epoch)
        print(latest_ckpt)

        # bert_classifier.load_weights(latest_ckpt)

        bert_classifier.load_weights(latest_ckpt)
        bert_classifier._set_inputs(inputs)
        print('export dir:', export_path_base)
        config_dir = os.path.join(checkpoint_dir, 'configs')
        if not os.path.exists(config_dir):
            os.mkdir(config_dir)
        print('config dir :', config_dir)

        shutil.copy(bert_config_path, os.path.join(config_dir, os.path.basename(bert_config_path)))
        shutil.copy(vocab_path, os.path.join(config_dir, os.path.basename(vocab_path)))
        bert_classifier.summary()
        export_path = os.path.join(tf.compat.as_bytes(export_path_base), tf.compat.as_bytes(str(model_version)))
        print('export model path :', export_path)
        bert_classifier.save(export_path, save_format='tf')

Note that the name parameter has to be specified for this to take effect:

inputs["input_word_ids"] = tf.keras.Input(shape=(128), dtype=tf.int32, name='input_word_ids')

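What the names buy you (my sketch, with a hypothetical TensorFlow Serving endpoint and model name): they become the keys of the serving request.

import requests

# Hypothetical TF Serving REST endpoint/model name; the keys under "inputs"
# must match the tf.keras.Input names set above.
url = "http://localhost:8501/v1/models/bert_classifier:predict"
payload = {
    "inputs": {
        "input_word_ids": [[101] + [0] * 127],
        "input_mask": [[1] + [0] * 127],
        "input_type_ids": [[0] * 128],
    }
}
print(requests.post(url, json=payload).json())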

7. Displaying Chinese characters with plt

blog.csdn.net/Disany/arti…
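The link above is truncated; the usual fix (a sketch, assuming a Chinese font such as SimHei is installed) is to point matplotlib at a Chinese-capable font:

import matplotlib.pyplot as plt

# Use a Chinese-capable font and keep the minus sign rendering correctly.
plt.rcParams["font.sans-serif"] = ["SimHei"]   # assumes SimHei is installed
plt.rcParams["axes.unicode_minus"] = False

plt.title("中文标题")   # sample Chinese title
plt.plot([1, 2, 3])
plt.show()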

8. Text classification experience

  • Using the pre-trained model BERT-wwm-ext gives better classification results than plain BERT
  • Oversampling will reduce the recall of classes with a small sample ratio