概述
语音识别是当前人工智能比较热门的方向,技术也比较成熟,各大公司也相继推出了各自的语音助手机器人,如百度的小度机器人、阿里的天猫精灵等。语音识别算法当前主要由RNN、LSTM、DNN-HMM等机器学习和深度学习技术做支撑。但训练这些模型的第一步就是将音频文件数据化,提取其中的语音特征。
mp3文件转化为wav文件
录制音频文件的软件大多数都是以mp3格式输出的,但mp3格式文件对语音的压缩比例较重,因此首先利用ffmpeg将其转化为wav原始文件,有利于语音特征的提取。其转化代码如下:
1
2
3
4
5
6
7
8
9
10
11
12
|
from pydub import AudioSegment
import pydub


def mp32wav(mp3_path, wav_path):
    """
    Convert an mp3 file to a wav file.

    :param mp3_path: path of the source mp3 file
    :param wav_path: path of the destination wav file
    """
    # Point pydub at the ffmpeg binary (Windows install location).
    pydub.AudioSegment.converter = "d:\\ffmpeg\\bin\\ffmpeg.exe"
    # Decode the mp3, then re-encode it into the uncompressed wav container.
    mp3_file = AudioSegment.from_mp3(file=mp3_path)
    mp3_file.export(wav_path, format="wav")
读取wav语音文件,对语音进行采样
利用wave库对语音文件进行采样。
代码如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
import wave
import json
import numpy as np


def read_wav(wav_path):
    """
    Read a single-channel wav file and return its metadata plus the
    normalized samples as a JSON string.

    :param wav_path: path of the wav file
    :return: JSON string with keys channel, samplewidth, framerate,
             numframes and wavedata (samples normalized to [-1, 1])
    """
    # Context manager guarantees the file handle is closed.
    with wave.open(wav_path, 'r') as wav_file:
        numchannel = wav_file.getnchannels()    # number of channels
        samplewidth = wav_file.getsampwidth()   # sample width in bytes
        framerate = wav_file.getframerate()     # sampling rate (Hz)
        numframes = wav_file.getnframes()       # number of frames
        print("channel", numchannel)
        print("sample_width", samplewidth)
        print("framerate", framerate)
        print("numframes", numframes)
        raw_bytes = wav_file.readframes(numframes)
    # np.fromstring was removed from numpy; frombuffer is the replacement.
    wav_data = np.frombuffer(raw_bytes, dtype=np.int16)
    # Normalize samples to [-1, 1] by the peak absolute amplitude.
    wav_data = wav_data * 1.0 / max(abs(wav_data))
    # ndarray is not JSON-serializable; convert it to a plain list first.
    # (Avoid shadowing the builtin name `dict` as the original did.)
    info = {"channel": numchannel,
            "samplewidth": samplewidth,
            "framerate": framerate,
            "numframes": numframes,
            "wavedata": list(wav_data)}
    return json.dumps(info)
绘制声波折线图与频谱图
代码如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
|
from matplotlib import pyplot as plt
import numpy as np


def drawspectrum(wav_data, framerate):
    """
    Plot the waveform (amplitude vs. time) and the spectrogram of an
    audio signal.

    :param wav_data: 1-D array of audio samples
    :param framerate: sampling rate in Hz
    """
    # Time axis in seconds: total duration is len(wav_data) / framerate.
    time = np.linspace(0, len(wav_data) / framerate * 1.0, num=len(wav_data))
    plt.figure(1)
    plt.plot(time, wav_data)
    plt.grid(True)  # original used lowercase `true`, a NameError
    plt.show()
    plt.figure(2)
    # specgram keywords are NFFT/Fs (the original lowercase nfft/fs would
    # raise a TypeError); use the actual sampling rate instead of a
    # hard-coded 16000 so non-16kHz audio is plotted on a correct axis.
    pxx, freqs, bins, im = plt.specgram(wav_data, NFFT=1024, Fs=framerate,
                                        noverlap=900)
    plt.show()
    print(pxx)
    print(freqs)
    print(bins)
    print(im)
下面利用百度AI开发平台的语音合成API生成的mp3文件进行上述过程,结果如下。
声波折线图
频谱图
全部代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
|
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @time    : 2018/7/5 13:11
# @author  : daipuwei
# @filename: voiceextract.py
# @software: pycharm
# @e-mail  : 771830171@qq.com
# @blog    : https://blog.csdn.net/qq_30091945
"""Convert mp3 files to wav, read their samples, and plot spectrograms."""

import os
import wave
import json

import numpy as np
from matplotlib import pyplot as plt
from pydub import AudioSegment
import pydub


def mp32wav(mp3_path, wav_path):
    """
    Convert an mp3 file to a wav file.

    :param mp3_path: path of the source mp3 file
    :param wav_path: path of the destination wav file
    """
    # Point pydub at the ffmpeg binary (Windows install location).
    pydub.AudioSegment.converter = "d:\\ffmpeg\\bin\\ffmpeg.exe"
    mp3_file = AudioSegment.from_mp3(file=mp3_path)
    mp3_file.export(wav_path, format="wav")


def read_wav(wav_path):
    """
    Read a single-channel wav file and return its metadata plus the
    normalized samples as a JSON string.

    :param wav_path: path of the wav file
    :return: JSON string with keys channel, samplewidth, framerate,
             numframes and wavedata (samples normalized to [-1, 1])
    """
    with wave.open(wav_path, 'r') as wav_file:
        numchannel = wav_file.getnchannels()    # number of channels
        samplewidth = wav_file.getsampwidth()   # sample width in bytes
        framerate = wav_file.getframerate()     # sampling rate (Hz)
        numframes = wav_file.getnframes()       # number of frames
        print("channel", numchannel)
        print("sample_width", samplewidth)
        print("framerate", framerate)
        print("numframes", numframes)
        raw_bytes = wav_file.readframes(numframes)
    # np.fromstring was removed from numpy; frombuffer is the replacement.
    wav_data = np.frombuffer(raw_bytes, dtype=np.int16)
    # Normalize samples to [-1, 1] by the peak absolute amplitude.
    wav_data = wav_data * 1.0 / max(abs(wav_data))
    # ndarray is not JSON-serializable; convert it to a plain list first.
    info = {"channel": numchannel,
            "samplewidth": samplewidth,
            "framerate": framerate,
            "numframes": numframes,
            "wavedata": list(wav_data)}
    return json.dumps(info)


def drawspectrum(wav_data, framerate):
    """
    Plot the waveform (amplitude vs. time) and the spectrogram of an
    audio signal.

    :param wav_data: 1-D array of audio samples
    :param framerate: sampling rate in Hz
    """
    time = np.linspace(0, len(wav_data) / framerate * 1.0, num=len(wav_data))
    plt.figure(1)
    plt.plot(time, wav_data)
    plt.grid(True)  # original used lowercase `true`, a NameError
    plt.show()
    plt.figure(2)
    # specgram keywords are NFFT/Fs; use the real sampling rate instead of
    # the original's hard-coded 16000.
    pxx, freqs, bins, im = plt.specgram(wav_data, NFFT=1024, Fs=framerate,
                                        noverlap=900)
    plt.show()
    print(pxx)
    print(freqs)
    print(bins)
    print(im)


def run_main():
    """Main driver: convert every mp3 under ./mp3_file, then plot each wav."""
    mp3_dir = './mp3_file'
    wav_dir = "./wav_file"
    # Collect the relative paths of all mp3 files.
    mp3_paths = [os.path.join(mp3_dir, name) for name in os.listdir(mp3_dir)]
    print(mp3_paths)
    # Derive the matching wav path: same basename, .wav extension.
    # (The original's slicing/splitting gymnastics reduced to this.)
    wav_paths = [os.path.join(wav_dir,
                              os.path.splitext(os.path.basename(p))[0] + '.wav')
                 for p in mp3_paths]
    print(wav_paths)
    # Convert every mp3 into a wav file.
    for mp3_path, wav_path in zip(mp3_paths, wav_paths):
        mp32wav(mp3_path, wav_path)
    # Digitize each wav file and plot its waveform/spectrogram.
    # (The original read each file twice; one pass suffices.)
    for wav_path in wav_paths:
        wav_json = read_wav(wav_path)
        print(wav_json)
        wav = json.loads(wav_json)
        wav_data = np.array(wav['wavedata'])
        framerate = int(wav['framerate'])
        drawspectrum(wav_data, framerate)


if __name__ == '__main__':
    run_main()
以上这篇使用python实现语音文件的特征提取方法就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持服务器之家。
原文链接:https://blog.csdn.net/qq_30091945/article/details/80941820