"""Label rows of an incremental CSV export with data from the "dinge" MongoDB.

For every CSV row the script cleans the value of the '定额编号' (quota number)
column, looks it up in the ``de-collection`` collection, decides which
``zhuanye`` (specialty) the row belongs to, and writes the enriched table to
``incremental_data.csv`` in the current working directory.
"""

import asyncio
import hashlib
import json
import os

import numpy as np
import pandas
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer

from subdir import db as dbservice
from subdir import service

model = SentenceTransformer("BAAI/bge-small-zh-v1.5")
client2 = MongoClient()

# (numeric code, value of the 'zhuanye' field in MongoDB, loader in `service`)
# 土建 = civil construction, 市政 = municipal, 安装 = installation,
# 园林 = landscaping, 修缮工程土建 / 修缮工程安装 = repair works (civil /
# installation).
_ZHUANYE_TABLE = (
    (10, '土建', 'getSingleDeXilie_tj'),
    (20, '市政', 'getSingleDeXilie_sz'),
    (30, '安装', 'getSingleDeXilie_az'),
    (40, '园林', 'getSingleDeXilie_yl'),
    (50, '修缮工程土建', 'getSingleDeXilie_xstj'),
    (60, '修缮工程安装', 'getSingleDeXilie_xsaz'),
)
_ZHUANYE_NAME_BY_CODE = {code: name for code, name, _ in _ZHUANYE_TABLE}
_ZHUANYE_BY_NAME = {name: (code, loader) for code, name, loader in _ZHUANYE_TABLE}

# Result of getSingleDeXilie_ when nothing usable was found: seven empty slots
# followed by the specialty code 0.
_NO_MATCH = (None, None, None, None, None, None, None, 0)


def _find_post(client, zhuanye, debh):
    """Return the first document for (debh, specialty code), or None."""
    name = _ZHUANYE_NAME_BY_CODE.get(zhuanye)
    if name is None:
        # Covers code 0 ("no specialty") and any unknown code.
        return None
    collection = client["dinge"]["de-collection"]
    return collection.find_one({'DEBH': debh, 'zhuanye': name})


def getParent(client, zhuanye, debh):
    """Return the 'parent' field of the entry matching ``debh``.

    Args:
        client: MongoDB client giving access to the "dinge" database.
        zhuanye: Numeric specialty code (10, 20, 30, 40, 50 or 60).
        debh: Value matched against the 'DEBH' field.

    Returns:
        The 'parent' value of the first matching document, or "" when the
        code is 0 / unknown or no document matches.
    """
    post = _find_post(client, zhuanye, debh)
    if post is None:
        return ""
    return post['parent']


def getLabel(client, zhuanye, debh):
    """Return "<parent> <GCLMC>" of the entry matching ``debh``.

    Args:
        client: MongoDB client giving access to the "dinge" database.
        zhuanye: Numeric specialty code (10, 20, 30, 40, 50 or 60).
        debh: Value matched against the 'DEBH' field.

    Returns:
        The 'parent' and 'GCLMC' fields of the first matching document joined
        by one space, or "" when the code is 0 / unknown or nothing matches.
    """
    post = _find_post(client, zhuanye, debh)
    if post is None:
        return ""
    return post['parent'] + " " + post['GCLMC']


def _load_xilie(zhuanye_name, debh):
    """Call the `service` loader for ``zhuanye_name`` and append its code."""
    entry = _ZHUANYE_BY_NAME.get(zhuanye_name)
    if entry is None:
        # Unknown specialty name in the database: report "no match" instead
        # of returning a bare None, which callers cannot unpack.
        return _NO_MATCH
    code, loader_name = entry
    # The loader is resolved at call time, as the original attribute access was.
    a1, a2, a3, a4, a5, a6, a7 = getattr(service, loader_name)(debh)
    return a1, a2, a3, a4, a5, a6, a7, code


def getSingleDeXilie_(model, client, zhuanye, debh, mc):
    """Resolve the specialty of ``debh`` and load its data from `service`.

    When several documents share the same 'DEBH', the specialty is chosen by
    (1) the last document whose 'GCLMC' is a substring of ``mc``; otherwise
    (2) the document whose "<parent> <GCLMC>" text has the highest embedding
    similarity to ``mc``.

    Args:
        model: Sentence-embedding model providing ``encode`` and
            ``similarity``.
        client: MongoDB client giving access to the "dinge" database.
        zhuanye: Unused; kept so existing callers keep working.
        debh: Raw or already cleaned quota number; it is passed through
            ``clean`` before the lookup.
        mc: Free text used to disambiguate multiple candidates. Non-string
            values (e.g. NaN from an empty CSV cell) are treated as "".

    Returns:
        An 8-tuple: the seven values returned by the matching
        ``service.getSingleDeXilie_*`` function followed by the numeric
        specialty code. ``(None,) * 7 + (0,)`` when no document matches or
        the stored specialty name is not one of the six known ones.
    """
    debh = clean(debh)
    if not isinstance(mc, str):
        # `x in mc` and model.encode both need text; NaN would raise here.
        mc = ""

    collection = client["dinge"]["de-collection"]
    posts = list(collection.find({'DEBH': debh}))
    if not posts:
        return _NO_MATCH

    if len(posts) == 1:
        chosen = posts[0]['zhuanye']
    else:
        name_hits = [post['zhuanye'] for post in posts if post['GCLMC'] in mc]
        if name_hits:
            # The last matching document wins, as in the original loop.
            chosen = name_hits[-1]
        else:
            texts = [post['parent'] + ' ' + post['GCLMC'] for post in posts]
            texts.append(mc)
            embeddings = model.encode(texts)
            similarities = model.similarity(embeddings[-1], embeddings)
            # Drop the last score: it is the query compared with itself.
            scores = similarities[0].numpy()[:-1]
            chosen = posts[np.argmax(scores).item()]['zhuanye']

    return _load_xilie(chosen, debh)


def clean(debh):
    """Strip annotations from a quota number so it can be looked up.

    In order: cut everything from "附注" onwards, cut everything from "*"
    onwards, drop one trailing "换", and finally cut at "[" together with the
    single character in front of it.

    Args:
        debh: Quota number as text.

    Returns:
        The cleaned quota number (possibly "").
    """
    for marker in ("附注", "*"):
        position = debh.find(marker)
        if position != -1:
            debh = debh[:position]
    if debh.endswith("换"):
        debh = debh[:-1]
    position = debh.find("[")
    if position != -1:
        # The character before "[" is dropped as well. Clamp at 0 so a leading
        # "[" yields "" instead of the slice [:-1], which would keep almost
        # the whole string.
        debh = debh[:max(position - 1, 0)]
    return debh


def handlebh(bh):
    """Normalise a project number to exactly 12 ASCII digits.

    Args:
        bh: Project number as text; non-digit characters are ignored.

    Returns:
        The 12 digits; 11 digits left-padded with one "0"; "" for any other
        digit count.
    """
    digits = ''.join(ch for ch in bh if '0' <= ch <= '9')
    if len(digits) == 12:
        return digits
    if len(digits) == 11:
        return '0' + digits
    return ''


def main():
    """Read the configured CSV files, label every row and write the result.

    The output is written to ``incremental_data.csv`` in the current working
    directory once per input file, so with several inputs only the last one
    survives.
    """
    print("start")
    files = ["/Users/xiaopengzhang/Downloads/incremental_data.csv"]
    for entry in files:
        if not entry.endswith("csv"):
            continue
        print(entry)

        table = pandas.read_csv(entry, dtype="str").fillna({'定额编号': ''})
        # .copy() makes the filtered frames independent, so the column
        # assignments below never write into a view of another frame.
        table = table[table['项目编号'].notnull()].copy()
        table["bianhao"] = table['项目编号'].map(handlebh)
        table = table[table['bianhao'].str.len() > 0].copy()

        # Cleaned quota numbers are needed for all three lookups below.
        codes = [clean(raw) for raw in table['定额编号']]

        # The last element of the returned tuple is the specialty code.
        labels = [
            getSingleDeXilie_(model, client2, 10, code, mc)[-1]
            for code, mc in zip(codes, table['工作内容1'])
        ]
        table['label'] = labels
        # Hash of the input path (not of the file contents).
        table['hash'] = hashlib.sha256(entry.encode()).hexdigest()
        table['level1'] = table['bianhao'].str[0:2]
        table['level2'] = table['bianhao'].str[0:4]
        table['level3'] = table['bianhao'].str[0:6]
        table['level4'] = table['bianhao'].str[0:9]
        table['parent'] = [
            getParent(client2, label, code)
            for label, code in zip(labels, codes)
        ]
        table['answer'] = [
            getLabel(client2, label, code)
            for label, code in zip(labels, codes)
        ]
        table.to_csv("incremental_data.csv")


# NOTE(review): runs on import as in the original; consider guarding with
# `if __name__ == "__main__":` once it is confirmed nothing imports this file
# for its side effect.
main()