| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240 |
- import pandas
- import json
- import numpy as np
- import asyncio
- from subdir import db as dbservice
- from subdir import service
- import os
- import hashlib
from sentence_transformers import SentenceTransformer
# Sentence-embedding model, instantiated once at import time. main() hands it
# to getSingleDeXilie_, which uses it to compare candidate names with the
# row's work-content text.
model = SentenceTransformer("BAAI/bge-small-zh-v1.5")
from pymongo import MongoClient
# Shared MongoDB client. No connection arguments are given, so pymongo's
# defaults apply. The functions below read client["dinge"]["de-collection"].
client2 = MongoClient()
def getParent(client, zhuanye, debh):
    """Return the ``parent`` field of the quota entry for a discipline code.

    Args:
        client: MongoDB client; ``client["dinge"]["de-collection"]`` is
            queried.
        zhuanye: Numeric discipline code (10, 20, 30, 40, 50 or 60). Any
            other value, including 0, yields an empty string.
        debh: Value matched against the ``DEBH`` field of the collection.

    Returns:
        The ``parent`` value of the first matching document, or ``""`` when
        the code is unknown or no document matches.
    """
    # Numeric code -> value stored in the collection's ``zhuanye`` field.
    discipline_by_code = {
        10: '土建',          # civil construction
        20: '市政',          # municipal
        30: '安装',          # installation
        40: '园林',          # landscaping
        50: '修缮工程土建',  # repair works, civil
        60: '修缮工程安装',  # repair works, installation
    }
    collection = client["dinge"]["de-collection"]

    discipline = discipline_by_code.get(zhuanye)
    if discipline is None:
        return ""

    # Only the first match is ever used, so ask for a single document
    # instead of opening a cursor and abandoning it after one item.
    post = collection.find_one({'DEBH': debh, 'zhuanye': discipline})
    if post is None:
        return ""
    return post['parent']
def getLabel(client, zhuanye, debh):
    """Return ``"<parent> <GCLMC>"`` for the quota entry of a discipline code.

    Args:
        client: MongoDB client; ``client["dinge"]["de-collection"]`` is
            queried.
        zhuanye: Numeric discipline code (10, 20, 30, 40, 50 or 60). Any
            other value, including 0, yields an empty string.
        debh: Value matched against the ``DEBH`` field of the collection.

    Returns:
        The first matching document's ``parent`` and ``GCLMC`` fields joined
        by a single space, or ``""`` when the code is unknown or no document
        matches.
    """
    # Numeric code -> value stored in the collection's ``zhuanye`` field.
    discipline_by_code = {
        10: '土建',          # civil construction
        20: '市政',          # municipal
        30: '安装',          # installation
        40: '园林',          # landscaping
        50: '修缮工程土建',  # repair works, civil
        60: '修缮工程安装',  # repair works, installation
    }
    collection = client["dinge"]["de-collection"]

    discipline = discipline_by_code.get(zhuanye)
    if discipline is None:
        return ""

    # Only the first match is ever used, so fetch a single document.
    post = collection.find_one({'DEBH': debh, 'zhuanye': discipline})
    if post is None:
        return ""
    return post['parent'] + " " + post['GCLMC']
def _strip_debh_decorations(debh):
    """Reduce a raw quota number to the bare value stored in ``DEBH``.

    Applies the same steps, in the same order, as ``clean()``: cut at "附注",
    cut at "*", drop a trailing "换", then cut at "[" together with the one
    character in front of it.
    """
    debh = debh.partition("附注")[0]
    debh = debh.partition("*")[0]
    if debh.endswith("换"):
        debh = debh[:-1]
    if "[" in debh:
        # Slicing the head (rather than using ``position - 1``) keeps the
        # result empty when "[" is the very first character.
        debh = debh.partition("[")[0][:-1]
    return debh


def getSingleDeXilie_(model, client, zhuanye, debh, mc):
    """Resolve a quota number to its discipline and load its series data.

    All documents in ``client["dinge"]["de-collection"]`` whose ``DEBH``
    equals the normalised ``debh`` are fetched, then one discipline is chosen:

    * one document: its ``zhuanye`` is used;
    * several documents: the last one whose ``GCLMC`` occurs as a substring
      of ``mc`` wins;
    * several documents, none matching: ``"<parent> <GCLMC>"`` of every
      document is embedded together with ``mc`` and the most similar
      document wins.

    Args:
        model: Object providing ``encode`` and ``similarity`` (the module
            passes a ``SentenceTransformer``). Only used in the last case.
        client: MongoDB client.
        zhuanye: Not used by this function; kept for call compatibility.
        debh: Raw quota number; trailing annotations are stripped first.
        mc: Descriptive text used to disambiguate between several documents.

    Returns:
        An 8-tuple: the seven values returned by the matching
        ``service.getSingleDeXilie_*`` loader followed by the numeric
        discipline code (10-60). When nothing matches, or the stored
        discipline is not one of the six known names, the tuple is seven
        ``None`` values followed by ``0``.
    """
    not_found = (None, None, None, None, None, None, None, 0)
    # Stored discipline name -> (loader attribute on ``service``, numeric code).
    # Loaders are resolved lazily so only the selected one is touched.
    loaders = {
        '土建': ('getSingleDeXilie_tj', 10),
        '市政': ('getSingleDeXilie_sz', 20),
        '安装': ('getSingleDeXilie_az', 30),
        '园林': ('getSingleDeXilie_yl', 40),
        '修缮工程土建': ('getSingleDeXilie_xstj', 50),
        '修缮工程安装': ('getSingleDeXilie_xsaz', 60),
    }

    collection = client["dinge"]["de-collection"]
    debh = _strip_debh_decorations(debh)

    disciplines = []
    names = []
    parents = []
    for post in collection.find({'DEBH': debh}):
        disciplines.append(post['zhuanye'])
        names.append(post['GCLMC'])
        parents.append(post['parent'])

    if not disciplines:
        return not_found

    if len(disciplines) == 1:
        chosen = disciplines[0]
    else:
        # Prefer a literal substring hit; when several documents hit, the
        # last one wins.
        hits = [d for d, name in zip(disciplines, names) if name in mc]
        if hits:
            chosen = hits[-1]
        else:
            # Fall back to embedding similarity between ``mc`` and each
            # candidate's "<parent> <name>" text.
            texts = [parent + ' ' + name for parent, name in zip(parents, names)]
            texts.append(mc)
            embeddings = model.encode(texts)
            similarities = model.similarity(embeddings[-1], embeddings)
            # The final score is ``mc`` compared with itself; drop it.
            scores = similarities[0].numpy()[:-1]
            chosen = disciplines[np.argmax(scores).item()]

    entry = loaders.get(chosen)
    if entry is None:
        # Previously this fell off the end and returned a bare None, which
        # broke callers that unpack eight values.
        return not_found

    loader_name, code = entry
    A1, A2, A3, A4, A5, A6, A7 = getattr(service, loader_name)(debh)
    return A1, A2, A3, A4, A5, A6, A7, code
def clean(debh):
    """Strip trailing annotations from a raw quota number.

    Steps, applied in order:

    1. Cut everything from the first "附注" onwards.
    2. Cut everything from the first "*" onwards.
    3. Drop one trailing "换".
    4. If a "[" remains, cut from it onwards and also drop the single
       character immediately before it.

    Args:
        debh: Raw quota number string.

    Returns:
        The cleaned string (possibly empty).
    """
    debh = debh.partition("附注")[0]
    debh = debh.partition("*")[0]
    if debh.endswith("换"):
        debh = debh[:-1]
    if "[" in debh:
        # Take the text before "[" and then drop its last character. Unlike
        # ``debh[:position - 1]`` this stays correct when "[" is at index 0,
        # where the old slice became ``[:-1]`` and kept most of the string.
        debh = debh.partition("[")[0][:-1]
    return debh
def handlebh(bh):
    """Normalise a project number to a 12-digit string.

    Every character outside ASCII ``0``-``9`` is discarded. Exactly 12
    remaining digits are returned as-is; exactly 11 are left-padded with one
    ``"0"``. Anything else is treated as invalid.

    Args:
        bh: Raw project number. Non-string values (e.g. ``None`` or a NaN
            float from pandas) are treated as invalid rather than raising.

    Returns:
        A 12-character digit string, or ``''`` when the input is invalid.
    """
    if not isinstance(bh, str):
        return ''

    digits = ''.join(ch for ch in bh if '0' <= ch <= '9')
    if len(digits) == 12:
        return digits
    if len(digits) == 11:
        return '0' + digits
    return ''
def main():
    """Label every row of the configured CSV files and write the result.

    For each ``*csv`` path in the hard-coded list the file is read with all
    columns as strings, rows without a usable '项目编号' are dropped, and
    these columns are added: ``bianhao``, ``label``, ``hash``,
    ``level1``-``level4``, ``parent`` and ``answer``. The frame is then
    written to ``incremental_data.csv`` in the current working directory
    (each input file overwrites the previous output).
    """
    def label_for(cleaned_debh, work_content):
        # Only the eighth value (the discipline code) is needed here. The
        # literal 10 fills the ``zhuanye`` slot, which getSingleDeXilie_
        # does not read.
        A1, A2, rg, jx, cl, A6, A7, code = getSingleDeXilie_(
            model, client2, 10, cleaned_debh, work_content)
        return code

    print("start")
    files = ["/Users/xiaopengzhang/Downloads/incremental_data.csv"]
    for entry in files:
        if not entry.endswith("csv"):
            continue
        print(entry)

        raw = pandas.read_csv(entry, dtype="str")
        raw = raw.fillna({'定额编号': ''})

        # .copy() gives an independent frame, so the column assignments below
        # do not write into a slice of ``raw`` (avoids pandas'
        # chained-assignment warning).
        rows = raw[raw['项目编号'].notnull()].copy()
        rows["bianhao"] = rows['项目编号'].map(handlebh)
        # handlebh signals an invalid number with '', so keep non-empty only.
        rows = rows[rows['bianhao'].str.len() > 0].copy()

        # Clean each quota number once and reuse it for all three lookups.
        cleaned = rows['定额编号'].map(clean)
        labels = [label_for(debh, work)
                  for debh, work in zip(cleaned, rows['工作内容1'])]

        rows['label'] = labels
        # Digest of the input *path* string (not of the file contents).
        rows['hash'] = hashlib.sha256(entry.encode()).hexdigest()
        rows['level1'] = rows['bianhao'].str[0:2]
        rows['level2'] = rows['bianhao'].str[0:4]
        rows['level3'] = rows['bianhao'].str[0:6]
        rows['level4'] = rows['bianhao'].str[0:9]
        rows['parent'] = [getParent(client2, label, debh)
                          for label, debh in zip(labels, cleaned)]
        rows['answer'] = [getLabel(client2, label, debh)
                          for label, debh in zip(labels, cleaned)]
        rows.to_csv("incremental_data.csv")
# Run the pipeline only when this file is executed as a script, so that
# importing the module (e.g. to reuse clean() or handlebh()) does not start
# reading and writing CSV files.
if __name__ == "__main__":
    main()
-
-
-
-
|