# data_process.py (9.3 KB)
  1. import pandas
  2. import json
  3. import numpy as np
  4. import asyncio
  5. from subdir import db as dbservice
  6. from subdir import service
  7. import os
  8. import hashlib
  9. from sentence_transformers import SentenceTransformer
  10. model = SentenceTransformer("BAAI/bge-small-zh-v1.5")
  11. from pymongo import MongoClient
  12. client2 = MongoClient()
  13. def getParent(client, zhuanye, debh):
  14. db = client["dinge"]
  15. collection = db["de-collection"]
  16. if zhuanye == 0:
  17. return ""
  18. if zhuanye == 10:
  19. for post in collection.find({'DEBH': debh, 'zhuanye' : '土建' }):
  20. return post['parent']
  21. if zhuanye == 20:
  22. for post in collection.find({'DEBH': debh, 'zhuanye' : '市政' }):
  23. return post['parent']
  24. if zhuanye == 30:
  25. for post in collection.find({'DEBH': debh, 'zhuanye' : '安装' }):
  26. return post['parent']
  27. if zhuanye == 40:
  28. for post in collection.find({'DEBH': debh, 'zhuanye' : '园林' }):
  29. return post['parent']
  30. if zhuanye == 50:
  31. for post in collection.find({'DEBH': debh, 'zhuanye' : '修缮工程土建' }):
  32. return post['parent']
  33. if zhuanye == 60:
  34. for post in collection.find({'DEBH': debh, 'zhuanye' : '修缮工程安装' }):
  35. return post['parent']
  36. return ""
  37. def getLabel(client, zhuanye, debh):
  38. db = client["dinge"]
  39. collection = db["de-collection"]
  40. if zhuanye == 0:
  41. return ""
  42. if zhuanye == 10:
  43. for post in collection.find({'DEBH': debh, 'zhuanye' : '土建' }):
  44. return post['parent'] + " " + post['GCLMC']
  45. if zhuanye == 20:
  46. for post in collection.find({'DEBH': debh, 'zhuanye' : '市政' }):
  47. return post['parent'] + " " + post['GCLMC']
  48. if zhuanye == 30:
  49. for post in collection.find({'DEBH': debh, 'zhuanye' : '安装' }):
  50. return post['parent'] + " " + post['GCLMC']
  51. if zhuanye == 40:
  52. for post in collection.find({'DEBH': debh, 'zhuanye' : '园林' }):
  53. return post['parent'] + " " + post['GCLMC']
  54. if zhuanye == 50:
  55. for post in collection.find({'DEBH': debh, 'zhuanye' : '修缮工程土建' }):
  56. return post['parent'] + " " + post['GCLMC']
  57. if zhuanye == 60:
  58. for post in collection.find({'DEBH': debh, 'zhuanye' : '修缮工程安装' }):
  59. return post['parent'] + " " + post['GCLMC']
  60. return ""
  61. def getSingleDeXilie_(model, client, zhuanye, debh, mc):
  62. db = client["dinge"]
  63. collection = db["de-collection"]
  64. count = 0
  65. actual_zhuanye = []
  66. mcs = []
  67. parents = []
  68. if "附注" in debh:
  69. position = debh.find("附注")
  70. debh = debh[:position]
  71. if "*" in debh:
  72. position = debh.find("*")
  73. debh = debh[:position]
  74. if debh.endswith("换"):
  75. debh = debh[:-1]
  76. if "[" in debh:
  77. position = debh.find("[")
  78. debh = debh[:position-1]
  79. for post in collection.find({'DEBH': debh}):
  80. ##print(post)
  81. actual_zhuanye.append(post['zhuanye'])
  82. mcs.append(post['GCLMC'])
  83. parents.append(post['parent'])
  84. count = count + 1
  85. if count == 0:
  86. return None, None, None, None, None, None, None, 0
  87. elif count == 1:
  88. if actual_zhuanye[0] == '土建':
  89. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_tj(debh)
  90. return A1, A2, A3, A4, A5, A6, A7, 10
  91. elif actual_zhuanye[0] == '市政':
  92. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_sz(debh)
  93. return A1, A2, A3, A4, A5, A6, A7, 20
  94. elif actual_zhuanye[0] == '安装':
  95. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_az(debh)
  96. return A1, A2, A3, A4, A5, A6, A7, 30
  97. elif actual_zhuanye[0] == '园林':
  98. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_yl(debh)
  99. return A1, A2, A3, A4, A5, A6, A7, 40
  100. elif actual_zhuanye[0] == '修缮工程土建':
  101. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_xstj(debh)
  102. return A1, A2, A3, A4, A5, A6, A7, 50
  103. elif actual_zhuanye[0] == '修缮工程安装':
  104. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_xsaz(debh)
  105. return A1, A2, A3, A4, A5, A6, A7, 60
  106. else:
  107. hit = False
  108. for i in range(0, count):
  109. if mcs[i] in mc:
  110. hit = True
  111. actual_zhuanye[0] = actual_zhuanye[i]
  112. if hit:
  113. if actual_zhuanye[0] == '土建':
  114. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_tj(debh)
  115. return A1, A2, A3, A4, A5, A6, A7, 10
  116. elif actual_zhuanye[0] == '市政':
  117. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_sz(debh)
  118. return A1, A2, A3, A4, A5, A6, A7, 20
  119. elif actual_zhuanye[0] == '安装':
  120. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_az(debh)
  121. return A1, A2, A3, A4, A5, A6, A7, 30
  122. elif actual_zhuanye[0] == '园林':
  123. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_yl(debh)
  124. return A1, A2, A3, A4, A5, A6, A7, 40
  125. elif actual_zhuanye[0] == '修缮工程土建':
  126. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_xstj(debh)
  127. return A1, A2, A3, A4, A5, A6, A7, 50
  128. elif actual_zhuanye[0] == '修缮工程安装':
  129. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_xsaz(debh)
  130. return A1, A2, A3, A4, A5, A6, A7, 60
  131. else:
  132. for i in range(len(parents)):
  133. mcs[i] = parents[i] + ' ' + mcs[i]
  134. mcs.append(mc)
  135. #print(mcs)
  136. embeddings = model.encode(mcs)
  137. similarities = model.similarity( embeddings[-1], embeddings)
  138. ##array([1.0000002, 0.7662151, 1.0000002], dtype=float32)
  139. ##print(similarities)
  140. sim = similarities[0].numpy()[:-1]
  141. ##print(sim)
  142. index = np.argmax(sim).item()
  143. ##print(index)
  144. ##print(actual_zhuanye)
  145. actual_zhuanye[0] = actual_zhuanye[index]
  146. if actual_zhuanye[0] == '土建':
  147. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_tj(debh)
  148. return A1, A2, A3, A4, A5, A6, A7, 10
  149. elif actual_zhuanye[0] == '市政':
  150. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_sz(debh)
  151. return A1, A2, A3, A4, A5, A6, A7, 20
  152. elif actual_zhuanye[0] == '安装':
  153. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_az(debh)
  154. return A1, A2, A3, A4, A5, A6, A7, 30
  155. elif actual_zhuanye[0] == '园林':
  156. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_yl(debh)
  157. return A1, A2, A3, A4, A5, A6, A7, 40
  158. elif actual_zhuanye[0] == '修缮工程土建':
  159. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_xstj(debh)
  160. return A1, A2, A3, A4, A5, A6, A7, 50
  161. elif actual_zhuanye[0] == '修缮工程安装':
  162. A1, A2, A3, A4, A5, A6, A7 = service.getSingleDeXilie_xsaz(debh)
  163. return A1, A2, A3, A4, A5, A6, A7, 60
  164. def clean(debh):
  165. if "附注" in debh:
  166. position = debh.find("附注")
  167. debh = debh[:position]
  168. if "*" in debh:
  169. position = debh.find("*")
  170. debh = debh[:position]
  171. if debh.endswith("换"):
  172. debh = debh[:-1]
  173. if "[" in debh:
  174. position = debh.find("[")
  175. debh = debh[:position-1]
  176. return debh
  177. def handlebh(bh):
  178. after = []
  179. for entry in bh:
  180. if entry >= '0' and entry <= '9':
  181. after.append(entry)
  182. if len(after) == 12:
  183. return ''.join(after)
  184. if len(after) == 11:
  185. return '0' + ''.join(after)
  186. return ''
  187. def main():
  188. def func1(input):
  189. cleaned = clean(input['定额编号'])
  190. A1, A2, rg, jx, cl, A6, A7, zhuanye = getSingleDeXilie_(model, client2, 10, cleaned, input['工作内容1'])
  191. ##print(json.loads(A1))
  192. ##print(zhuanye)
  193. return zhuanye
  194. print("start")
  195. files = os.listdir("/Users/xiaopengzhang/Develop/data/wang")
  196. for entry in files:
  197. if entry.endswith("csv"):
  198. print(entry)
  199. a = pandas.read_csv("/Users/xiaopengzhang/Develop/data/wang/" + entry, dtype="str")
  200. b = a.fillna({'定额编号':''})
  201. c = b[b['项目编号'].notnull()]
  202. c["bianhao"] = c.apply(lambda x: handlebh(x['项目编号']), axis=1)
  203. d = c[c['bianhao'].notnull()]
  204. e = d[d['bianhao'].str.len() > 0]
  205. e['label'] = e.apply(lambda x: func1(x), axis=1)
  206. e['hash'] = hashlib.sha256(entry.encode()).hexdigest()
  207. e['level1'] = e.apply(lambda x: x['bianhao'][0:2], axis=1)
  208. e['level2'] = e.apply(lambda x: x['bianhao'][0:4], axis=1)
  209. e['level3'] = e.apply(lambda x: x['bianhao'][0:6], axis=1)
  210. e['level4'] = e.apply(lambda x: x['bianhao'][0:9], axis = 1)
  211. e['parent'] = e.apply(lambda x: getParent(client2, x['label'], clean(x['定额编号'])), axis = 1)
  212. e['answer'] = e.apply(lambda x: getLabel(client2, x['label'], clean(x['定额编号'])), axis = 1)
  213. e.to_csv(hashlib.sha256(entry.encode()).hexdigest() + ".hash.answer.csv")
  214. main()