xiaopzhang
/
qingdan


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
							import pandas
import json
import numpy as np
import asyncio
from subdir import db as dbservice
from subdir import service
import os
import hashlib
from sentence_transformers  import SentenceTransformer
# Sentence-embedding model, instantiated once when this module is loaded;
# main() passes it to getSingleDeXilie_ for the similarity fallback.
model = SentenceTransformer("BAAI/bge-small-zh-v1.5")
from pymongo import MongoClient

# Shared MongoDB client, created with no arguments (so pymongo's default
# connection settings apply); main() passes it to the lookup helpers.
client2 = MongoClient()


def getParent(client, zhuanye, debh):
    """Return the ``parent`` field of the first record matching a quota number.

    Queries the ``de-collection`` collection of the ``dinge`` database for
    documents whose ``DEBH`` equals ``debh`` and whose ``zhuanye`` is the
    specialty name paired with the numeric ``zhuanye`` code.

    Args:
        client: Object indexable as ``client["dinge"]["de-collection"]`` whose
            result exposes ``find(query)``.
        zhuanye: Numeric specialty code (10, 20, 30, 40, 50 or 60).
        debh: Quota number compared against the ``DEBH`` field.

    Returns:
        ``post['parent']`` of the first matching document, or ``""`` when the
        code is 0, is not one of the known codes, or nothing matches.
    """
    records = client["dinge"]["de-collection"]
    if zhuanye == 0:
        return ""

    # Numeric code -> value stored in the documents' 'zhuanye' field.
    code_to_name = (
        (10, '土建'),
        (20, '市政'),
        (30, '安装'),
        (40, '园林'),
        (50, '修缮工程土建'),
        (60, '修缮工程安装'),
    )
    for code, name in code_to_name:
        if zhuanye == code:
            # Only the first document yielded by the query is used.
            for post in records.find({'DEBH': debh, 'zhuanye': name}):
                return post['parent']
    return ""

def getLabel(client, zhuanye, debh):
    """Return ``"<parent> <GCLMC>"`` for the first record matching a quota number.

    Queries the ``de-collection`` collection of the ``dinge`` database for
    documents whose ``DEBH`` equals ``debh`` and whose ``zhuanye`` is the
    specialty name paired with the numeric ``zhuanye`` code.

    Args:
        client: Object indexable as ``client["dinge"]["de-collection"]`` whose
            result exposes ``find(query)``.
        zhuanye: Numeric specialty code (10, 20, 30, 40, 50 or 60).
        debh: Quota number compared against the ``DEBH`` field.

    Returns:
        The first matching document's ``parent`` and ``GCLMC`` fields joined
        by a single space, or ``""`` when the code is 0, is not one of the
        known codes, or nothing matches.
    """
    records = client["dinge"]["de-collection"]
    if zhuanye == 0:
        return ""

    # Numeric code -> value stored in the documents' 'zhuanye' field.
    code_to_name = (
        (10, '土建'),
        (20, '市政'),
        (30, '安装'),
        (40, '园林'),
        (50, '修缮工程土建'),
        (60, '修缮工程安装'),
    )
    for code, name in code_to_name:
        if zhuanye == code:
            # Only the first document yielded by the query is used.
            for post in records.find({'DEBH': debh, 'zhuanye': name}):
                return post['parent'] + " " + post['GCLMC']
    return ""


# Result returned whenever no usable record/specialty can be resolved.
_DEXILIE_UNRESOLVED = (None, None, None, None, None, None, None, 0)

# Stored 'zhuanye' name -> (name of the `service` lookup function, numeric code).
_DEXILIE_DISPATCH = {
    '土建': ('getSingleDeXilie_tj', 10),
    '市政': ('getSingleDeXilie_sz', 20),
    '安装': ('getSingleDeXilie_az', 30),
    '园林': ('getSingleDeXilie_yl', 40),
    '修缮工程土建': ('getSingleDeXilie_xstj', 50),
    '修缮工程安装': ('getSingleDeXilie_xsaz', 60),
}


def _strip_debh_suffixes(debh):
    """Trim the annotations this file strips from a raw quota number."""
    debh = debh.partition("附注")[0]
    debh = debh.partition("*")[0]
    if debh.endswith("换"):
        debh = debh[:-1]
    bracket = debh.find("[")
    if bracket != -1:
        # The character right before '[' is dropped as well (kept from the
        # original slicing; presumably a separator - confirm). Clamp at 0 so a
        # leading '[' yields '' rather than a negative slice that would only
        # chop the last character.
        debh = debh[:max(bracket - 1, 0)]
    return debh


def _dexilie_for_specialty(specialty, debh):
    """Call the `service` lookup for ``specialty`` and append its numeric code."""
    target = _DEXILIE_DISPATCH.get(specialty)
    if target is None:
        # Unknown specialty name: previously the function fell off the end and
        # returned a bare None, which broke callers unpacking eight values.
        return _DEXILIE_UNRESOLVED
    func_name, code = target
    A1, A2, A3, A4, A5, A6, A7 = getattr(service, func_name)(debh)
    return A1, A2, A3, A4, A5, A6, A7, code


def getSingleDeXilie_(model, client, zhuanye, debh, mc):
    """Resolve a quota number to its specialty and fetch the matching series.

    The quota number is trimmed, then looked up by ``DEBH`` in the
    ``de-collection`` collection of the ``dinge`` database. The specialty is
    chosen as follows:

    * no record: nothing is resolved;
    * exactly one record: that record's ``zhuanye``;
    * several records: the last record whose ``GCLMC`` is a substring of
      ``mc``; if none is, the record whose ``"<parent> <GCLMC>"`` text has the
      highest ``model.similarity`` score against ``mc``.

    Args:
        model: Object providing ``encode(list_of_str)`` and
            ``similarity(a, b)``; only used in the several-records fallback.
        client: Object indexable as ``client["dinge"]["de-collection"]`` whose
            result exposes ``find(query)``.
        zhuanye: Not used by this function; kept for call compatibility.
        debh: Raw quota number.
        mc: Free-text description used to disambiguate several records.

    Returns:
        An 8-tuple: the seven values returned by the matching
        ``service.getSingleDeXilie_*`` function followed by the numeric
        specialty code (10..60). When no record exists, or the resolved
        specialty name is not one of the six known names, the tuple is seven
        ``None`` values followed by ``0``.
    """
    collection = client["dinge"]["de-collection"]
    debh = _strip_debh_suffixes(debh)

    specialties = []
    names = []
    parents = []
    for post in collection.find({'DEBH': debh}):
        specialties.append(post['zhuanye'])
        names.append(post['GCLMC'])
        parents.append(post['parent'])

    if not specialties:
        return _DEXILIE_UNRESOLVED
    if len(specialties) == 1:
        return _dexilie_for_specialty(specialties[0], debh)

    # Several candidates: prefer a literal name match; the last match wins,
    # as in the original loop that overwrote its choice on every hit.
    literal_hits = [
        specialty
        for specialty, name in zip(specialties, names)
        if name in mc
    ]
    if literal_hits:
        return _dexilie_for_specialty(literal_hits[-1], debh)

    # No literal match: compare mc against each "<parent> <name>" text.
    texts = [parent + ' ' + name for parent, name in zip(parents, names)]
    texts.append(mc)
    embeddings = model.encode(texts)
    similarities = model.similarity(embeddings[-1], embeddings)
    # Drop the last score: it is mc compared with itself.
    scores = similarities[0].numpy()[:-1]
    best = np.argmax(scores).item()
    return _dexilie_for_specialty(specialties[best], debh)
def clean(debh):
    """Strip trailing annotations from a raw quota number.

    Applied in order:

    1. cut everything from the first ``"附注"`` onward;
    2. cut everything from the first ``"*"`` onward;
    3. drop one trailing ``"换"``;
    4. cut from one character before the first ``"["`` onward.

    Args:
        debh: Raw quota number string.

    Returns:
        The trimmed string (possibly empty).
    """
    debh = debh.partition("附注")[0]
    debh = debh.partition("*")[0]
    if debh.endswith("换"):
        debh = debh[:-1]
    bracket = debh.find("[")
    if bracket != -1:
        # The character right before '[' is dropped as well (kept from the
        # original slicing; presumably a separator - confirm). Clamp at 0 so a
        # leading '[' yields '' rather than a negative slice that would only
        # chop the last character.
        debh = debh[:max(bracket - 1, 0)]
    return debh

def handlebh(bh):
    """Reduce a project number to a 12-digit string.

    Only characters between ``'0'`` and ``'9'`` are kept. Twelve kept digits
    are returned unchanged, eleven are left-padded with a single ``'0'``, and
    any other count yields ``''``.

    Args:
        bh: Iterable of characters (a string in this file).

    Returns:
        The 12-character digit string, or ``''`` when the digit count is
        neither 11 nor 12.
    """
    digits = [ch for ch in bh if '0' <= ch <= '9']
    # Digit count -> prefix needed to reach twelve characters.
    padding = {12: '', 11: '0'}.get(len(digits))
    if padding is None:
        return ''
    return padding + ''.join(digits)
def main():
    """Label each row of the input CSV and write the enriched table.

    For every path in the hard-coded ``files`` list that ends with ``"csv"``:
    read it with all columns as strings, keep rows whose ``项目编号``
    normalises (via ``handlebh``) to a non-empty ``bianhao``, then add the
    columns ``label``, ``hash``, ``level1``..``level4``, ``parent`` and
    ``answer`` and write the result to ``incremental_data.csv`` in the current
    working directory. That output file is rewritten for each input path.
    """
    def specialty_code(row):
        # Only the last element (the numeric specialty code) is needed.
        *_, code = getSingleDeXilie_(
            model, client2, 10, clean(row['定额编号']), row['工作内容1'])
        return code

    print("start")
    files = ["/Users/xiaopengzhang/Downloads/incremental_data.csv"]
    for path in files:
        if not path.endswith("csv"):
            continue
        print(path)
        raw = pandas.read_csv(path, dtype="str")
        filled = raw.fillna({'定额编号': ''})

        # Work on explicit copies: adding columns to a filtered slice is the
        # chained-assignment pattern pandas warns about.
        numbered = filled[filled['项目编号'].notnull()].copy()
        numbered["bianhao"] = numbered['项目编号'].map(handlebh)
        # A missing bianhao compares False here, so no separate notnull
        # filter is needed.
        table = numbered[numbered['bianhao'].str.len() > 0].copy()

        table['label'] = table.apply(specialty_code, axis=1)
        # Digest of the path string itself, not of the file contents.
        table['hash'] = hashlib.sha256(path.encode()).hexdigest()
        # Prefixes of the 12-digit number.
        table['level1'] = table['bianhao'].str[0:2]
        table['level2'] = table['bianhao'].str[0:4]
        table['level3'] = table['bianhao'].str[0:6]
        table['level4'] = table['bianhao'].str[0:9]
        table['parent'] = table.apply(
            lambda row: getParent(client2, row['label'], clean(row['定额编号'])),
            axis=1)
        table['answer'] = table.apply(
            lambda row: getLabel(client2, row['label'], clean(row['定额编号'])),
            axis=1)
        table.to_csv("incremental_data.csv")

# Script entry point: there is no __main__ guard, so the pipeline runs
# whenever this module is executed or imported.
main()