Two solutions to the jieba word-segmentation Memory Error

1. Background

Recently, Gensim's Word2vec was used to train a synonym model on a domain-specific corpus, and the input for model training has to be a file that has already been word-segmented. While segmenting the original corpus file, a Memory Error occurred because the corpus is large (nearly 50 million records). This post presents two solutions to the problem. The code also shows how to filter by part of speech and remove stop words and punctuation during segmentation, and the model-training code that consumes the segmented file is attached at the end.

2. Solutions

There are two solutions. The first is to avoid loading all of the data at once: a single thread reads and processes the file line by line. The second is to split the single large data file into multiple smaller files and segment them in parallel.

2.1 The first solution: load and process the data line by line

# -*- coding: utf-8 -*-
"""Segment the original text with jieba and save the result to a new file."""
import jieba
import numpy as np
import jieba.posseg as pseg
import re

filePath='/data/work/keyword/work_data/work_title_description.csv'
fileSegWordDonePath ='/data/work/keyword/work_cutdata/corpus_line.txt'

# Path to the stop-word list
stop_word_path = '/data/work/keyword/keyword_extraction-master/data/stopWord.txt'
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords
    
# Print a list of Chinese strings
def PrintListChinese(list):
    for i in range(len(list)):
        print (list[i])
        
# Read the contents of the file into a list
fileTrainRead = []
with open(filePath,'r') as fileTrainRaw:
    for line in fileTrainRaw:  # read the file line by line
        fileTrainRead.append(line)
    
# jieba saves the word in the list
fileTrainSeg=[]
jieba.enable_paddle() 
stopwords = stopwordslist(stop_word_path)  # here is the path to load stop words
outstr = ' '
for i in range(len(fileTrainRead)):
    for x in pseg.cut(fileTrainRead[i][0:], use_paddle=True):  # keep only the specified parts of speech
        if x.flag == 'n' or x.flag == 'nw' or x.flag == 'nz' or x.flag.startswith('TIME') or x.flag.startswith('t'):
            if x.word not in stopwords:
                # Remove digits and punctuation
                y = re.sub(r"[0-9\s+\.\!\/_,$%^*()?;；:\-【】+\"\']+|[+——！，;:。？、~@#￥%……&*（）]+", "", x.word)
                if y != '\t':
                    outstr += y
                    outstr += " "
    if i % 100 == 0:
        print(i)
fileTrainSeg.append([outstr])

# Save the segmentation result to a file
with open(fileSegWordDonePath,'w',encoding='utf-8') as fW:
    for i in range(len(fileTrainSeg)):
        fW.write(fileTrainSeg[i][0])
        fW.write('\n')
      
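Note that the script above still reads every line into the fileTrainRead list and accumulates the whole result in outstr before writing, so its memory use still grows with the corpus size. A fully streaming variant is sketched below; it reuses the same paths, part-of-speech filter and stop-word list as the script above, segments each line as it is read and writes it out immediately, so only one line is held in memory at a time. This sketch is not part of the original script.

# -*- coding: utf-8 -*-
# Streaming sketch: segment each line as it is read and write it out immediately,
# so the full corpus never sits in memory. Paths and filters follow the script above.
import re
import jieba
import jieba.posseg as pseg

filePath = '/data/work/keyword/work_data/work_title_description.csv'
fileSegWordDonePath = '/data/work/keyword/work_cutdata/corpus_line.txt'
stop_word_path = '/data/work/keyword/keyword_extraction-master/data/stopWord.txt'

jieba.enable_paddle()
with open(stop_word_path, 'r', encoding='utf-8') as f:
    stopwords = set(line.strip() for line in f)

punct = re.compile(r"[0-9\s+\.\!\/_,$%^*()?;；:\-【】+\"\']+|[+——！，;:。？、~@#￥%……&*（）]+")

with open(filePath, 'r', encoding='utf-8') as fin, \
     open(fileSegWordDonePath, 'w', encoding='utf-8') as fout:
    for i, line in enumerate(fin):  # only one line is held in memory at a time
        words = []
        for x in pseg.cut(line, use_paddle=True):
            # Same part-of-speech filter as the script above
            if x.flag in ('n', 'nw', 'nz') or x.flag.startswith(('TIME', 't')):
                if x.word not in stopwords:
                    w = punct.sub("", x.word)
                    if w:
                        words.append(w)
        fout.write(' '.join(words) + '\n')
        if i % 100 == 0:
            print(i)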

2.2 The second solution: split a large data file into multiple files
    # -*-coding:utf-8 -*-
    import jieba.analyse
    import jieba
    import os
    import jieba.posseg as pseg
    
    jieba.enable_parallel(4)
    raw_data_path = '/data/work/keyword/work_data/'
    cut_data_path = '/data/work/keyword/work_cutdata/'
    stop_word_path = '/data/work/keyword/keyword_extraction-master/data/stopWord.txt'
    def stopwordslist(filepath):
        stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
        return stopwords
    def cut_word(raw_data_path, cut_data_path ):
        # Read all the data files under this path
        data_file_list = os.listdir(raw_data_path)
        corpus = ' '
        temp = 0
        for file in data_file_list:
            with open(raw_data_path + file,'rb') as f:
                print(temp+1)
                temp +=1
                document = f.read()
                document_cut = jieba.cut(document, cut_all=False)
                result = ' '.join(document_cut)
                corpus += result
        with open(cut_data_path + 'corpus.txt', 'w+', encoding='utf-8') as f:
            f.write(corpus)  # write the raw segmentation result to disk
        stopwords = stopwordslist(stop_word_path)  # path to load stop words
        with open(cut_data_path + 'corpus.txt', 'r', encoding='utf-8') as f:
            document_cut = f.read()
            outstr = ' '
            for word in document_cut.split():  # iterate over segmented words, not characters
                if word not in stopwords:
                    if word != '\t':
                        outstr += word
                        outstr += " "
        with open(cut_data_path + 'corpus1.txt', 'w+', encoding='utf-8') as f:
            f.write(outstr)  # write the stop-word-filtered corpus
    if __name__ == "__main__":
        cut_word(raw_data_path, cut_data_path )
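The code above assumes the large original file has already been split into several smaller files under raw_data_path. A minimal sketch of that splitting step is shown below; the split_data_path directory, the chunk size of 1,000,000 lines and the part_N.txt file names are illustrative assumptions, not taken from the original script. Note also that jieba.enable_parallel is based on the multiprocessing module and is not available on Windows.

    # Split one large file into several smaller ones so that each can be segmented separately.
    # split_data_path, the chunk size and the file names are illustrative assumptions.
    import os

    source_file = '/data/work/keyword/work_data/work_title_description.csv'
    split_data_path = '/data/work/keyword/work_data_split/'
    lines_per_file = 1000000

    os.makedirs(split_data_path, exist_ok=True)
    fout = None
    with open(source_file, 'r', encoding='utf-8') as fin:
        for count, line in enumerate(fin):
            if count % lines_per_file == 0:  # start a new part file every million lines
                if fout:
                    fout.close()
                part = count // lines_per_file + 1
                fout = open(os.path.join(split_data_path, 'part_%d.txt' % part), 'w', encoding='utf-8')
            fout.write(line)
    if fout:
        fout.close()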

3. Use Gensim Word2vec to train the model

"" gensim word2vec get word vector """
import warnings
import logging
import os.path
import sys
import multiprocessing
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# ignore warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])  # read the name of the current script
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s',level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # inp is the input corpus, out_model is the output model, out_vector is the word-vector file
    inp = '/data/work/keyword/work_cutdata/corpus_line.txt'
    out_model = '/data/work/keyword/word2vec_model/work_title_description.model'
    out_vector = '/data/work/keyword/word2vec_model/work_title_description.vector'
    # Train the Word2Vec model (the gensim default is CBOW; pass sg=1 for Skip-Gram)
    model = Word2Vec(LineSentence(inp), size=50, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())
    # Save the model
    model.save(out_model)
    # Save the word vectors in text format
    model.wv.save_word2vec_format(out_vector, binary=False)

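After training, the saved model can be reloaded and queried. The sketch below is only a usage illustration: the query word "数据" is an arbitrary example, and note that in gensim 4.x the size argument used above was renamed to vector_size.

from gensim.models import Word2Vec

# Reload the trained model (the path matches the training script above)
model = Word2Vec.load('/data/work/keyword/word2vec_model/work_title_description.model')

# Look up the 50-dimensional vector of a word (the query word is an arbitrary example)
print(model.wv['数据'].shape)

# Find the ten most similar words according to the trained model
for word, score in model.wv.most_similar('数据', topn=10):
    print(word, score)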

4. Summary

During development, the first approach, reading the file line by line, was ultimately used for word segmentation and for training the model. The second approach reads multiple files from a directory; in a test that read 20 files separately, the Memory Error no longer appeared.

