您好,请问jieba现在有没有支持windows系统的并行模块呢?

qlckcl4x  于 6个月前  发布在  Windows
关注(0)|答案(1)|浏览(134)

原理:将目标文本按行分隔后,把各行文本分配到多个python进程并行分词,然后归并结果,从而获得分词速度的可观提升
基于python自带的multiprocessing模块,目前暂不支持windows

ny6fqffe

ny6fqffe1#

You can try this. It works on Windows.

from path import Path
from multiprocessing import Pool
import argparse
import time

# Number of input lines handed to a worker as one task; `do` slices its batch
# into chunks of this size. Overwritten from --LINE_PER_CORE when run as a script.
LINE_PER_CORE = 5000
# Number of worker processes `do` passes to multiprocessing.Pool.
# Overwritten from --NUM_CORE when run as a script.
NUM_CORE = 30
# NOTE(review): FLOOR_COUNT and CEIL_COUNT are not referenced anywhere in the
# visible code — presumably leftovers from a frequency filter; confirm before removing.
FLOOR_COUNT = 10
CEIL_COUNT = 200

import jieba

def process_one(_in):
    """Segment every line of one chunk with jieba.

    Runs inside a worker process. Each line is tokenised with ``jieba.cut``,
    the tokens are joined with single spaces, and leading/trailing whitespace
    (including the line's trailing newline) is stripped.

    Args:
        _in: Iterable of text lines.

    Returns:
        A list with one space-separated, stripped string per input line,
        in input order.
    """
    return [' '.join(jieba.cut(text)).strip() for text in _in]

def do(l_list, writer):
    """Segment a batch of lines in parallel and write the results in order.

    The batch is cut into chunks of ``LINE_PER_CORE`` lines, each chunk is
    processed by ``process_one`` in a worker process, and the segmented lines
    are written to ``writer`` one per line, preserving input order.

    Args:
        l_list: List of text lines to segment.
        writer: Object with a ``write(str)`` method (e.g. an open text file).
    """
    # Nothing to segment: avoid paying for process start-up at all.
    if not l_list:
        return

    chunks = [l_list[start:start + LINE_PER_CORE]
              for start in range(0, len(l_list), LINE_PER_CORE)]

    # Never start more workers than there are chunks; spawning a process is
    # expensive (notably on Windows, where every worker re-imports the module).
    # The context manager terminates the workers if map() raises, so a failed
    # batch no longer leaks processes.
    with Pool(min(NUM_CORE, len(chunks))) as pool:
        results = pool.map(process_one, chunks)
        # Graceful shutdown on the success path.
        pool.close()
        pool.join()

    for segmented in results:
        for line in segmented:
            writer.write(line + '\n')

# Script entry point: segment every *.txt file in the input folder and write
# "<name>.output.txt" into the output folder. The __main__ guard is required
# for multiprocessing on Windows, where workers re-import this module.
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-i","--input", help="input folder", default=".")
    parser.add_argument("-o", "--output", help="output folder", default="w_process")
    parser.add_argument("--LINE_PER_CORE", help="# lines per core", type=int, default=20000)
    parser.add_argument("--NUM_CORE", help="# of cores", type=int, default=30)
    parser.add_argument("--coding", type=str, default="utf-8")

    args = parser.parse_args()
    print("Args :", args)
    input_folder = args.input
    output_folder = args.output
    # Rebind the module-level settings that do() reads in this (parent) process.
    LINE_PER_CORE = args.LINE_PER_CORE
    NUM_CORE = args.NUM_CORE
    coding = args.coding

    # One batch gives every worker exactly one chunk of LINE_PER_CORE lines.
    batch_size = NUM_CORE * LINE_PER_CORE

    if not Path(output_folder).exists():
        Path(output_folder).mkdir()
    for f in Path(input_folder).files('*.txt'):
        print(f.basename(), time.strftime('%Y-%m-%d %X', time.localtime()))
        # --coding was previously parsed but ignored; it now selects the file
        # encoding (default utf-8, so the default behaviour is unchanged).
        with open(output_folder + '/%s.output.txt' % (f.namebase,),'w', encoding=coding) as f_out:
            with open(f.abspath(),'r', encoding=coding) as f_in:
                l_list = []
                for l in f_in:
                    # Always keep the current line, then flush once the batch
                    # is full. Flushing in an else-branch instead of appending
                    # silently dropped one line per full batch.
                    l_list.append(l)
                    if len(l_list) >= batch_size:
                        do(l_list, f_out)
                        print(f.basename(), time.strftime('%Y-%m-%d %X', time.localtime()))
                        l_list = []
                # Flush the final, partially filled batch.
                if l_list:
                    do(l_list, f_out)
    print(time.strftime('%Y-%m-%d %X', time.localtime()))

相关问题