data_validate.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. import pandas
  2. import json
  3. import numpy as np
  4. import asyncio
  5. from subdir import db as dbservice
  6. from subdir import service
  7. import os
  8. import hashlib
  9. from sentence_transformers import SentenceTransformer
  10. model = SentenceTransformer("BAAI/bge-small-zh-v1.5")
  11. from pymongo import MongoClient
  12. client = MongoClient()
  13. client2 = MongoClient()
  def func1(input, content):
      """Return the specialty (zhuanye) code resolved for one quota entry.

      The raw code is normalised with ``clean`` and looked up through
      ``getSingleDeXilie_`` using the module-level ``model`` and ``client2``.
      Only the last element of the lookup result (the numeric specialty
      code) is handed back; the other elements are discarded.

      Args:
          input: Raw quota code string; suffixes are stripped by ``clean``.
          content: Quota name, forwarded unchanged as the ``mc`` argument.

      Returns:
          The last element of the tuple returned by ``getSingleDeXilie_``.
      """
      normalized_code = clean(input)
      lookup_result = getSingleDeXilie_(model, client2, 10, normalized_code, content)
      # Only the trailing specialty code matters to callers of this helper.
      *_series_payload, zhuanye = lookup_result
      return zhuanye
  # Specialty name stored in the collection -> (loader attribute on ``service``,
  # numeric specialty code returned to callers).
  _ZHUANYE_LOADERS = {
      '土建': ('getSingleDeXilie_tj', 10),
      '市政': ('getSingleDeXilie_sz', 20),
      '安装': ('getSingleDeXilie_az', 30),
      '园林': ('getSingleDeXilie_yl', 40),
      '修缮工程土建': ('getSingleDeXilie_xstj', 50),
      '修缮工程安装': ('getSingleDeXilie_xsaz', 60),
  }

  # Result used whenever no specialty can be resolved (specialty code 0).
  _NO_MATCH = (None, None, None, None, None, None, None, 0)


  def _strip_debh_suffixes(debh):
      """Cut annotation suffixes ("附注", "*", trailing "换", "[...") off a code."""
      debh = debh.partition("附注")[0]
      debh = debh.partition("*")[0]
      if debh.endswith("换"):
          debh = debh[:-1]
      bracket_at = debh.find("[")
      if bracket_at != -1:
          # The character right before "[" is dropped as well (presumably a
          # separator -- TODO confirm).  Clamp at 0 so a leading "[" yields ""
          # instead of wrapping around to debh[:-1].
          debh = debh[:max(bracket_at - 1, 0)]
      return debh


  def _load_series(zhuanye_name, debh):
      """Call the ``service`` loader for a specialty name and append its code."""
      entry = _ZHUANYE_LOADERS.get(zhuanye_name)
      if entry is None:
          # Unknown specialty: report "no match" rather than returning None,
          # which would break callers that unpack eight values.
          return _NO_MATCH
      loader_name, code = entry
      a1, a2, a3, a4, a5, a6, a7 = getattr(service, loader_name)(debh)
      return a1, a2, a3, a4, a5, a6, a7, code


  def getSingleDeXilie_(model, client, zhuanye, debh, mc):
      """Resolve which specialty a quota code belongs to and load its series.

      The code is stripped of annotation suffixes and looked up in the
      ``de-collection`` collection of the ``dinge`` database.  When several
      records share the code, the record whose ``GCLMC`` is a substring of
      ``mc`` is preferred (the last such record wins); if none is, the record
      whose ``parent + ' ' + GCLMC`` text is most similar to ``mc`` according
      to ``model`` is used.

      Args:
          model: Object providing ``encode`` and ``similarity`` (the file
              passes a ``SentenceTransformer``); only used when several
              records match and none is a substring hit.
          client: Mapping-style database client (``client["dinge"]``).
          zhuanye: Unused; kept for backward compatibility with callers.
          debh: Quota code, possibly carrying annotation suffixes.
          mc: Quota name used to disambiguate between several records.

      Returns:
          An 8-tuple: the seven values produced by the matching ``service``
          loader followed by the numeric specialty code (10-60).  When no
          record matches, or the matched specialty name is not one of the
          known ones, seven ``None`` values followed by ``0``.
      """
      collection = client["dinge"]["de-collection"]
      debh = _strip_debh_suffixes(debh)

      posts = list(collection.find({'DEBH': debh}))
      if not posts:
          return _NO_MATCH
      if len(posts) == 1:
          return _load_series(posts[0]['zhuanye'], debh)

      # Several specialties share this code: prefer a record whose name
      # occurs inside ``mc``.  No early exit, so the last hit wins.
      chosen = None
      for post in posts:
          if post['GCLMC'] in mc:
              chosen = post['zhuanye']

      if chosen is None:
          # No literal hit: rank the candidates by embedding similarity.
          texts = [post['parent'] + ' ' + post['GCLMC'] for post in posts]
          texts.append(mc)  # the query goes last so it can be sliced off below
          embeddings = model.encode(texts)
          similarities = model.similarity(embeddings[-1], embeddings)
          # Drop the final column: it is the query compared with itself.
          scores = similarities[0].numpy()[:-1]
          chosen = posts[np.argmax(scores).item()]['zhuanye']

      return _load_series(chosen, debh)
  def clean(debh):
      """Strip annotation suffixes from a quota code.

      Applied in order:
        1. everything from the first "附注" onwards is removed;
        2. everything from the first "*" onwards is removed;
        3. a single trailing "换" is removed;
        4. everything from the first "[" onwards is removed, together with
           the one character directly before it (presumably a separator --
           TODO confirm).

      Args:
          debh: Raw quota code string.

      Returns:
          The code without the suffixes above.  A code that starts with "["
          yields "" (previously the negative slice index wrapped around and
          only chopped the last character).
      """
      debh = debh.partition("附注")[0]
      debh = debh.partition("*")[0]
      if debh.endswith("换"):
          debh = debh[:-1]
      bracket_at = debh.find("[")
      if bracket_at != -1:
          # Clamp so that bracket_at == 0 does not turn into debh[:-1].
          debh = debh[:max(bracket_at - 1, 0)]
      return debh
  def getLabel(client, zhuanye, debh):
      """Build the "parent GCLMC" label for a quota code within one specialty.

      Args:
          client: Mapping-style database client (``client["dinge"]``).
          zhuanye: Numeric specialty code: 10, 20, 30, 40, 50 or 60.
              ``0`` and any other value produce an empty label.
          debh: Quota code to look up in ``de-collection``.

      Returns:
          ``parent + " " + GCLMC`` of the first record matching both the code
          and the specialty, or ``""`` when nothing matches.
      """
      collection = client["dinge"]["de-collection"]
      if zhuanye == 0:
          return ""

      # Numeric specialty code -> specialty name stored in the collection.
      code_to_name = (
          (10, '土建'),
          (20, '市政'),
          (30, '安装'),
          (40, '园林'),
          (50, '修缮工程土建'),
          (60, '修缮工程安装'),
      )
      specialty = next((name for code, name in code_to_name if zhuanye == code), None)
      if specialty is None:
          return ""

      # Only the first matching record is used.
      for record in collection.find({'DEBH': debh, 'zhuanye': specialty}):
          return record['parent'] + " " + record['GCLMC']
      return ""
  # Export script: for every child entry of the selected bill-of-quantities
  # documents, resolve the specialty code and label of the entry's quota code
  # and write one CSV row per entry to "validate1.csv".
  db = client["baojia"]
  collection = db["qdxm"]
  dfs = []
  for post in collection.find({'biao_id': "68f1f8fbc93ecc425087f4e8"}):
      print(post)
      for entry in post["__children"]:
          # NOTE(review): the child's quota code is read from its "清单编码"
          # key and exported as "定额编码" -- confirm this matches the schema.
          zhuanye = func1(entry["清单编码"], entry["名称"])
          label = getLabel(client2, zhuanye, clean(entry["清单编码"]))
          # Parent fields are wrapped in one-element lists so that from_dict
          # builds a single-row frame; the scalars are broadcast to that row.
          item = {
              '名称': [post["名称"]],
              "项目特征": [post["项目特征"]],
              "清单编码": [post["清单编码"]],
              "定额编码": [entry["清单编码"]],
              "定额名称": [entry["名称"]],
              "label": label,
              "zhuanye": zhuanye,
          }
          df = pandas.DataFrame.from_dict(item)
          dfs.append(df)
  if dfs:
      data = pandas.concat(dfs)
  else:
      # pandas.concat raises ValueError on an empty list; emit a header-only
      # file instead of crashing when no document or child entry was found.
      data = pandas.DataFrame(
          columns=['名称', "项目特征", "清单编码", "定额编码", "定额名称", "label", "zhuanye"]
      )
  data.to_csv("validate1.csv")