[月份维度]日志数据提取包含关键词的事件,解析落入的月份计数,matplotlib绘制统计图,python

x33g5p2x  于2022-08-17 转载在 Python  
字(3.2k)|赞(0)|评价(0)|浏览(763)

**[月份维度]**日志数据提取包含关键词的事件,解析落入的月份计数,matplotlib绘制统计图,python

日志数据提取事件关键词,解析对应日期的星期计数,matplotlib绘制统计图,python

https://zhangphil.blog.csdn.net/article/details/125941649

https://zhangphil.blog.csdn.net/article/details/125941649

在上文(按星期统计)代码的基础上进行修改,改为以月份为维度,统计包含关键词的事件落在各个月份的次数。

  1. from datetime import datetime
  2. from pprint import pp
  3. import pandas as pd
  4. import matplotlib
  5. import matplotlib.pyplot as plt
  6. from fuzzywuzzy import fuzz
  7. import re
  8. FILE_PATH = r'源数据路径'  # path of the log file that read_file() opens
  9. KEY = r'模糊匹配的关键词' # keyword(s) fuzzy-matched against every line, e.g. keyword1,keyword2
  10. threshold = 80  # minimum fuzz.partial_ratio score for a line to count as a hit
  11. SECTION = 'section'  # dict key holding a month bucket's identifier
  12. SUM = 'sum'  # dict key holding a month bucket's event count
  def drawchart(df, font_path=r'C:\Windows\Fonts\msyh.ttc'):
      """Show a horizontal bar chart of the ranked counts in *df*.

      Args:
          df: DataFrame whose first column holds the bar labels and whose
              second column holds the counts; the index value of each row is
              printed inside its bar.
          font_path: Font file used for all CJK text on the chart. Defaults to
              the font file the script originally hard-coded.
      """
      # Imported explicitly instead of relying on pyplot importing it as a
      # side effect.
      from matplotlib import font_manager

      # Raw string default: the previous non-raw literal contained invalid
      # escape sequences (backslash-W, backslash-F, backslash-m).
      myfont = font_manager.FontProperties(fname=font_path)
      plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
      plt.rc('font', family='YaHei', weight='bold')

      ranks = df.index.tolist()
      names = df.iloc[:, 0].tolist()
      counts = [int(value) for value in df.iloc[:, 1]]

      FONT_SIZE = 12
      fig, ax = plt.subplots(figsize=(15, 13))
      bars = ax.barh(y=range(len(names)), width=counts, align='center', color='red')

      # Label every bar: the count to the right of it, the rank centred inside.
      for rect, rank in zip(bars, ranks):
          w = rect.get_width()
          y_mid = rect.get_y() + rect.get_height() / 2
          ax.text(x=w, y=y_mid, s=str(int(w)),
                  horizontalalignment='left', verticalalignment='center',
                  fontproperties=myfont, fontsize=FONT_SIZE - 2, color='green')
          ax.text(x=w / 2, y=y_mid, s=str(rank),
                  horizontalalignment='center', verticalalignment='center',
                  fontproperties=myfont, fontsize=FONT_SIZE - 3, color='white')

      ax.set_yticks(range(len(names)))
      ax.set_yticklabels(names, fontsize=FONT_SIZE - 1, fontproperties=myfont)
      ax.invert_yaxis()  # first row of df ends up at the top

      ax.set_xlabel('数据', fontsize=FONT_SIZE + 2, fontproperties=myfont)
      ax.set_title('数据点总量排名', fontsize=FONT_SIZE + 3, fontproperties=myfont)

      # No tick labels on the x axis.
      plt.xticks(())

      # Remove the frame around the plot but keep the y axis labels.
      ax.get_yaxis().set_visible(True)
      for spine in ("left", "top", "right", "bottom"):
          ax.spines[spine].set_visible(False)

      plt.subplots_adjust(left=0.15)  # widen the left margin for the labels
      ax.set_aspect('auto')
      plt.show()
  def read_file():
      """Scan FILE_PATH and collect the dates of lines that fuzzy-match KEY.

      Every line whose fuzz.partial_ratio score against KEY reaches
      ``threshold`` is reported on stdout. If the line contains a timestamp of
      the form ``xxxx年xx月xx日xx时xx分`` that parses as a valid date, its date
      is collected.

      Returns:
          list of datetime.date, one per matching line with a parsable
          timestamp, in file order.
      """
      # Timestamp pattern: xxxx年xx月xx日xx时xx分 (compiled once, outside the loop).
      stamp_re = re.compile(r'\d{4}\年\d{1,2}\月\d{1,2}\日\d{1,2}\时\d{1,2}\分')
      report = '第{number}行,相似度{ratio},时间{case_time}\n{content}'

      all_case_time = []
      case_count = 1
      # The context manager closes the file even if processing raises.
      with open(FILE_PATH, 'r', encoding='UTF-8') as file:
          for cnt, line in enumerate(file, start=1):
              pr = fuzz.partial_ratio(line, KEY)
              if pr < threshold:
                  continue

              print('-----')
              print(f'第{case_count}件')
              case_count += 1

              mat = stamp_re.search(line)
              if mat is None:
                  # No timestamp on this line; still report the hit.
                  t_str = '-解析异常-'
              else:
                  t_str = mat.group()
                  try:
                      object_t = datetime.strptime(t_str, "%Y年%m月%d日%H时%M分")
                  except ValueError:
                      # Digits matched the pattern but are not a real date/time.
                      print('解析日期失败')
                  else:
                      all_case_time.append(object_t.date())
                      print(f'{object_t.date().strftime("%Y-%m-%d")} {object_t.weekday()}')

              pp(report.format(number=cnt, ratio=pr, case_time=t_str, content=line))
      return all_case_time
  def data_frame():
      """Count the dates returned by read_file() per calendar month.

      Returns:
          list of 12 dicts ``{SECTION: index, SUM: count}`` where ``index`` is
          the 0-based month index (0 = January ... 11 = December), i.e. the
          index that number_to_month() expects.
      """
      times = [{SECTION: i, SUM: 0} for i in range(12)]
      for t in read_file():
          # date.month is 1-based while the buckets are 0-based. Comparing the
          # two directly dropped December and shifted every other month by one.
          times[t.month - 1][SUM] += 1
      return times
  def number_to_month(number):
      """Return the Chinese month name for a 0-based month index (0 -> '一月')."""
      numerals = ('一', '二', '三', '四', '五', '六',
                  '七', '八', '九', '十', '十一', '十二')
      return numerals[number] + '月'
  if __name__ == '__main__':
      times = data_frame()

      # Build one [month name, count] row per bucket for the DataFrame.
      col = ['月份', '次数']
      pd_data = [[number_to_month(bucket[SECTION]), bucket[SUM]] for bucket in times]
      df = pd.DataFrame(data=pd_data, columns=col)

      # Rank by count, highest first, then renumber the index from 1 so the
      # index value doubles as the rank.
      df = df.sort_values(by=col[1], axis=0, ascending=False).reset_index(drop=True)
      df.index = df.index + 1

      # Print the first 20 rows of the ranking.
      pp(df.head(20))
      drawchart(df)

变换关键词,查找后生成的统计图:

相关文章