// Copyright 2020 The TensorFlow Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ==============================================================================
//
// This class has been generated, DO NOT EDIT!
//
package org.tensorflow.op;

import org.tensorflow.Operand;
import org.tensorflow.op.audio.AudioSpectrogram;
import org.tensorflow.op.audio.DecodeWav;
import org.tensorflow.op.audio.EncodeWav;
import org.tensorflow.op.audio.Mfcc;
import org.tensorflow.types.TFloat32;
import org.tensorflow.types.TInt32;
import org.tensorflow.types.TString;

/**
 * An API for building {@code audio} operations as {@link Op Op}s.
 *
 * @see Ops
 */
public final class AudioOps {
  private final Scope scope;

  AudioOps(Scope scope) {
    this.scope = scope;
  }

  /**
   * Produces a visualization of audio data over time.
   * <p>
   * Spectrograms are a standard way of representing audio information as a series of
   * slices of frequency information, one slice for each window of time. By joining
   * these together into a sequence, they form a distinctive fingerprint of the sound
   * over time.
   * <p>
   * This op expects to receive audio data as an input, stored as floats in the range
   * -1 to 1, together with a window width in samples, and a stride specifying how
   * far to move the window between slices. From this it generates a three
   * dimensional output. The first dimension is for the channels in the input, so a
   * stereo audio input would have two here for example. The second dimension is time,
   * with successive frequency slices. The third dimension has an amplitude value for
   * each frequency during that time slice.
   * <p>
   * This means the layout when converted and saved as an image is rotated 90 degrees
   * clockwise from a typical spectrogram. Time descends down the Y axis, and
   * frequency decreases from left to right.
   * <p>
   * Each value in the result represents the square root of the sum of the squared
   * real and imaginary parts of an FFT on the current window of samples. In this way,
   * the lowest dimension represents the power of each frequency in the current
   * window, and adjacent windows are concatenated in the next dimension.
   * <p>
   * To get a more intuitive and visual look at what this operation does, you can run
   * tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
   * resulting spectrogram as a PNG image.
   *
   * @param input Float representation of audio data.
   * @param windowSize How wide the input window is in samples. For the highest efficiency
   *     this should be a power of two, but other values are accepted.
   * @param stride How widely apart the centers of adjacent sample windows should be.
   * @param options carries optional attribute values
   * @return a new instance of AudioSpectrogram
   */
  public AudioSpectrogram audioSpectrogram(Operand<TFloat32> input, Long windowSize, Long stride,
      AudioSpectrogram.Options... options) {
    return AudioSpectrogram.create(scope, input, windowSize, stride, options);
  }

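  // A minimal usage sketch, not part of the generated API: assuming an existing
  // `Ops` instance named `tf` and an audio tensor with float values in [-1.0, 1.0],
  // such as the audio output of decodeWav below, a spectrogram with a 1024-sample
  // window and a 512-sample stride could be built roughly like this:
  //
  //   Operand<TFloat32> samples = ...;  // e.g. the audio() output of decodeWav
  //   AudioSpectrogram spectrogram = tf.audio.audioSpectrogram(samples, 1024L, 512L);
  //   Operand<TFloat32> slices = spectrogram.spectrogram();  // [channels, time, frequency]
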
  /**
   * Decode a 16-bit PCM WAV file to a float tensor.
   * <p>
   * The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
   * <p>
   * When desired_channels is set, if the input contains fewer channels than this
   * then the last channel will be duplicated to give the requested number; if
   * the input has more channels than requested then the additional channels will be
   * ignored.
   * <p>
   * If desired_samples is set, then the audio will be cropped or padded with zeroes
   * to the requested length.
   * <p>
   * The first output contains a Tensor with the content of the audio samples. The
   * lowest (innermost) dimension will be the number of channels, and the outer one
   * the number of samples. For example, a ten-sample-long stereo WAV file gives an
   * output shape of [10, 2].
   *
   * @param contents The WAV-encoded audio, usually from a file.
   * @param options carries optional attribute values
   * @return a new instance of DecodeWav
   */
  public DecodeWav decodeWav(Operand<TString> contents, DecodeWav.Options... options) {
    return DecodeWav.create(scope, contents, options);
  }

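  // A minimal usage sketch, not part of the generated API: assuming an `Ops`
  // instance named `tf` and a scalar string tensor holding the raw bytes of a
  // WAV file. The option names are assumed from the generated DecodeWav.Options
  // class and shown as an illustration only:
  //
  //   Operand<TString> wavBytes = ...;  // e.g. read from disk with tf.io.readFile
  //   DecodeWav wav = tf.audio.decodeWav(wavBytes,
  //       DecodeWav.desiredChannels(1L), DecodeWav.desiredSamples(16000L));
  //   Operand<TFloat32> audio = wav.audio();          // shape [samples, channels]
  //   Operand<TInt32> sampleRate = wav.sampleRate();  // scalar sample rate
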
  /**
   * Encode audio data using the WAV file format.
   * <p>
   * This operation will generate a string suitable to be saved out to create a .wav
   * audio file. It will be encoded in the 16-bit PCM format. It takes in float
   * values in the range -1.0f to 1.0f, and any values outside that range will be
   * clamped to it.
   * <p>
   * `audio` is a 2-D float Tensor of shape `[length, channels]`.
   * `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
   *
   * @param audio 2-D with shape `[length, channels]`.
   * @param sampleRate Scalar containing the sample frequency.
   * @return a new instance of EncodeWav
   */
  public EncodeWav encodeWav(Operand<TFloat32> audio, Operand<TInt32> sampleRate) {
    return EncodeWav.create(scope, audio, sampleRate);
  }

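  // A minimal usage sketch, not part of the generated API: re-encoding the
  // audio/sampleRate pair from the decodeWav sketch above, assuming `tf` is an
  // `Ops` instance:
  //
  //   EncodeWav encoded = tf.audio.encodeWav(audio, sampleRate);
  //   Operand<TString> wavContents = encoded.contents();  // WAV bytes ready to be saved
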
  /**
   * Transforms a spectrogram into a form that's useful for speech recognition.
   * <p>
   * Mel Frequency Cepstral Coefficients are a way of representing audio data that
   * has proven effective as an input feature for machine learning. They are created
   * by taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of
   * the higher frequencies that are less significant to the human ear. They have a
   * long history in the speech recognition world, and
   * https://en.wikipedia.org/wiki/Mel-frequency_cepstrum is a good resource to
   * learn more.
   *
   * @param spectrogram Typically produced by the Spectrogram op, with magnitude_squared
   *     set to true.
   * @param sampleRate How many samples per second the source audio used.
   * @param options carries optional attribute values
   * @return a new instance of Mfcc
   */
  public Mfcc mfcc(Operand<TFloat32> spectrogram, Operand<TInt32> sampleRate,
      Mfcc.Options... options) {
    return Mfcc.create(scope, spectrogram, sampleRate, options);
  }
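
  // A minimal usage sketch, not part of the generated API, chaining the ops in this
  // class: assuming `tf` is an `Ops` instance, `wav` was produced by decodeWav as
  // above, and the spectrogram is computed with magnitude_squared enabled; the
  // AudioSpectrogram.magnitudeSquared option name is assumed from the generated
  // Options class:
  //
  //   AudioSpectrogram spectrogram = tf.audio.audioSpectrogram(
  //       wav.audio(), 1024L, 512L, AudioSpectrogram.magnitudeSquared(true));
  //   Mfcc mfcc = tf.audio.mfcc(spectrogram.spectrogram(), wav.sampleRate());
  //   Operand<TFloat32> features = mfcc.output();  // MFCC features per time slice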
}