"""Build a validation CSV of discipline ("zhuanye") lookups for one bill.

On import/run this module loads a sentence-embedding model, opens two MongoDB
clients, walks every document in ``baojia.qdxm`` that matches a fixed
``biao_id``, resolves a discipline code and a label for each child entry via
the ``dinge.de-collection`` collection, and writes the rows to
``validate1.csv``.
"""
import asyncio
import hashlib
import json
import os

import numpy as np
import pandas
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer

from subdir import db as dbservice
from subdir import service

model = SentenceTransformer("BAAI/bge-small-zh-v1.5")

client = MongoClient()
client2 = MongoClient()

# Discipline name as stored in "de-collection" -> (name of the lookup function
# on ``service``, numeric discipline code returned to callers).  The function is
# resolved with getattr() at call time, exactly when the original if/elif chain
# would have touched it.
_ZHUANYE_DISPATCH = {
    '土建': ("getSingleDeXilie_tj", 10),
    '市政': ("getSingleDeXilie_sz", 20),
    '安装': ("getSingleDeXilie_az", 30),
    '园林': ("getSingleDeXilie_yl", 40),
    '修缮工程土建': ("getSingleDeXilie_xstj", 50),
    '修缮工程安装': ("getSingleDeXilie_xsaz", 60),
}

# Reverse view used by getLabel(): numeric code -> stored discipline name.
_ZHUANYE_BY_CODE = {code: name for name, (_, code) in _ZHUANYE_DISPATCH.items()}

# Result shape shared by "no document found" and "unknown discipline".
_NOT_FOUND = (None, None, None, None, None, None, None, 0)

_BIAO_ID = "68f1f8fbc93ecc425087f4e8"
_OUTPUT_CSV = "validate1.csv"
_CSV_COLUMNS = ['名称', "项目特征", "清单编码", "定额编码", "定额名称", "label", "zhuanye"]


def func1(input, content):
    """Return the numeric discipline code for a raw code and an item name.

    Args:
        input: Raw code string; it is normalised with :func:`clean` first.
        content: Item name used to disambiguate when several documents share
            the same code.

    Returns:
        The last element of :func:`getSingleDeXilie_`'s result: 10/20/30/40/
        50/60 for a recognised discipline, 0 otherwise.
    """
    cleaned = clean(input)
    result = getSingleDeXilie_(model, client2, 10, cleaned, content)
    return result[-1]


def _fetch_by_zhuanye(zhuanye_name, debh):
    """Call the ``service`` lookup for *zhuanye_name* and append its code.

    Returns the seven values produced by the service function followed by the
    numeric discipline code.  An unrecognised discipline yields the same
    all-``None``/code-0 tuple as a missing document; the original code fell
    through and returned a bare ``None`` here, which made the 8-way unpacking
    in callers raise ``TypeError``.
    """
    target = _ZHUANYE_DISPATCH.get(zhuanye_name)
    if target is None:
        return _NOT_FOUND
    method_name, code = target
    a1, a2, a3, a4, a5, a6, a7 = getattr(service, method_name)(debh)
    return a1, a2, a3, a4, a5, a6, a7, code


def getSingleDeXilie_(model, client, zhuanye, debh, mc):
    """Resolve the discipline of code *debh* and fetch its service data.

    All documents in ``dinge.de-collection`` whose ``DEBH`` equals the cleaned
    code are considered:

    * no document: seven ``None`` values and code 0;
    * one document: its ``zhuanye`` is used;
    * several documents: the last one whose ``GCLMC`` is a substring of *mc*
      wins; if none is, the document whose ``parent + ' ' + GCLMC`` text is
      most similar to *mc* (by ``model`` embeddings) wins.

    Args:
        model: Object providing ``encode`` and ``similarity`` (a
            ``SentenceTransformer`` in this file).
        client: MongoDB client used to reach ``dinge.de-collection``.
        zhuanye: Unused; kept for interface compatibility.
        debh: Code string; normalised with :func:`clean` before the query.
        mc: Item name used for disambiguation.

    Returns:
        An 8-tuple: the seven values returned by the matching
        ``service.getSingleDeXilie_*`` function plus the numeric discipline
        code (10, 20, 30, 40, 50 or 60), or seven ``None`` values and 0 when
        nothing matches or the stored discipline is not recognised.
    """
    collection = client["dinge"]["de-collection"]
    debh = clean(debh)

    zhuanyes = []
    names = []
    parents = []
    for post in collection.find({'DEBH': debh}):
        zhuanyes.append(post['zhuanye'])
        names.append(post['GCLMC'])
        parents.append(post['parent'])

    if not zhuanyes:
        return _NOT_FOUND
    if len(zhuanyes) == 1:
        return _fetch_by_zhuanye(zhuanyes[0], debh)

    # Several candidates: prefer a literal name match; the last match wins,
    # as in the original loop.
    hits = [i for i, name in enumerate(names) if name in mc]
    if hits:
        index = hits[-1]
    else:
        # No literal match: rank candidates by embedding similarity to mc.
        texts = [parent + ' ' + name for parent, name in zip(parents, names)]
        texts.append(mc)
        embeddings = model.encode(texts)
        similarities = model.similarity(embeddings[-1], embeddings)
        # Drop the final column, which is mc compared with itself.
        sim = similarities[0].numpy()[:-1]
        index = np.argmax(sim).item()

    return _fetch_by_zhuanye(zhuanyes[index], debh)


def clean(debh):
    """Strip annotations from a code string and return the bare code.

    In order: everything from the first ``附注`` is removed, then everything
    from the first ``*``, then one trailing ``换``, then everything from one
    character before the first ``[``.

    Args:
        debh: Raw code string.

    Returns:
        The normalised code string.
    """
    debh = debh.partition("附注")[0]
    debh = debh.partition("*")[0]
    if debh.endswith("换"):
        debh = debh[:-1]
    position = debh.find("[")
    if position != -1:
        # The character right before "[" is dropped as well (original
        # behaviour; presumably a marker/separator -- TODO confirm).  Clamp at
        # 0: with "[" in first position the old ``position - 1`` became -1 and
        # kept the bracket while cutting the last character instead.
        debh = debh[:max(position - 1, 0)]
    return debh


def getLabel(client, zhuanye, debh):
    """Return ``"<parent> <GCLMC>"`` for a code within one discipline.

    Args:
        client: MongoDB client used to reach ``dinge.de-collection``.
        zhuanye: Numeric discipline code (10, 20, 30, 40, 50 or 60).
        debh: Cleaned code string matched against the ``DEBH`` field.

    Returns:
        The label built from the first matching document, or ``""`` when the
        discipline code is not recognised or no document matches.
    """
    zhuanye_name = _ZHUANYE_BY_CODE.get(zhuanye)
    if zhuanye_name is None:
        return ""
    collection = client["dinge"]["de-collection"]
    post = collection.find_one({'DEBH': debh, 'zhuanye': zhuanye_name})
    if post is None:
        return ""
    return post['parent'] + " " + post['GCLMC']


db = client["baojia"]
collection = db["qdxm"]
dfs = []
for post in collection.find({'biao_id': _BIAO_ID}):
    print(post)
    for entry in post["__children"]:
        zhuanye = func1(entry["清单编码"], entry["名称"])
        label = getLabel(client2, zhuanye, clean(entry["清单编码"]))
        item = {
            '名称': [post["名称"]],
            "项目特征": [post["项目特征"]],
            "清单编码": [post["清单编码"]],
            "定额编码": [entry["清单编码"]],
            "定额名称": [entry["名称"]],
            "label": label,
            "zhuanye": zhuanye,
        }
        dfs.append(pandas.DataFrame.from_dict(item))

if dfs:
    data = pandas.concat(dfs)
else:
    # pandas.concat([]) raises ValueError; emit a header-only file instead.
    print("no rows found for biao_id " + _BIAO_ID)
    data = pandas.DataFrame(columns=_CSV_COLUMNS)
data.to_csv(_OUTPUT_CSV)