
/************************************

  PicchioEngine

  Copyright(c)2008 Emanuele Bettidi

************************************/

/* Audio.cpp */

#include <emmintrin.h>
#include <cmath>
#include <ios>
#include <iostream>
#include <SDL/SDL.h>
#include "Types.h"
#include "Config.h"
#include "Audio.h"

namespace Audio
{
 /* Forward declarations of functions that are referenced before their definition in this file. */
 void gen_fd_bleps();
 void callback(void *userdata, Uint8 *stream, int len);

 const float64 Pi = 3.1415926535897932384626433832795029;
 // Cutoff frequency of the windowed-sinc kernel, as a fraction of the sampling rate (between 0 and 0.5).
 // The divisor is 48000 * 75, i.e. the output rate times the same value as PHASE_RES.
 // 17600 = 11 * 1600, alt: 20800 = 13 * 1600
 const float64 Fc = 17600.0 / (48000 * 75);
 const int FDBLEP_LEN = 16;  // first difference blep length (taps written per step into fd_buffer)
 const int PHASE_RES = 75;  // phase resolution (number of sub-sample phases, i.e. rows of fd_blep)
 const int FILTER_LEN = (PHASE_RES * (FDBLEP_LEN - 1)) - 1;  // filter length minus one

 // Handshake flag: set by wait_callback() once 'buffer' holds a rendered block, cleared by callback() after copying it.
 // The explicit '= false' is redundant (namespace-scope objects are zero-initialized) but harmless.
 volatile bool buffer_is_full = false;
 SDL_sem *semaphore;  // posted by callback() when a buffer has been consumed; waited on by wait_callback()
 SDL_AudioSpec desired;  // audio format requested from SDL in init()
 SDL_AudioSpec obtained;  // audio format actually granted by SDL in init()

 int buffer_len;  // number of sample frames per audio buffer; valid range: 1~3072
 int32 offset;  // fractional position carried between buffers, kept in [0, 13125) by calc_psg_cycles()
 int32 psg_cycles;  // number of PSG cycles covered by the current buffer (see calc_psg_cycles())
 int32 sum[2];  // per-channel running-sum value carried over from the previous buffer
 float64 blep[FILTER_LEN / 2];  // first half of the band-limited step, built in gen_fd_bleps()
 int32 int_blep[PHASE_RES * FDBLEP_LEN];  // full integer band-limited step (0..32768), padded with 32768
 int16 fd_blep[PHASE_RES][FDBLEP_LEN] __attribute__ ((aligned (16)));  // first differences of int_blep, one row per phase
 int16 buffer[3072 * 2];  // interleaved stereo output handed to the SDL callback
 int32 fd_buffer[2][4096] __attribute__ ((aligned (16)));  // per-channel first-difference accumulation buffers
 // note: 4096 instead of 3072 to leave room for the FDBLEP_LEN (+16) entries written past the end of a buffer,
 //       and for optimization reasons (power-of-two buffer size)

 float64 float_buf[3072];  // scratch buffer used by the filters in get_samples()
 float64 hpf_a1[2];  // high-pass filter: previous input sample, per channel
 float64 hpf_b1[2];  // high-pass filter: previous output sample, per channel
 float64 lpf1_b1[2];  // first low-pass filter: previous output sample, per channel
 float64 lpf2_b1[2];  // second low-pass filter: previous output sample, per channel

 int32 full_bufs = 0;  // number of callback() invocations that found a rendered buffer
 int32 empty_bufs = 0;  // number of callback() invocations that had to output silence

 /*
  * Initializes the audio module.
  *
  * Resets the synthesis state, builds the first-difference blep tables, opens
  * the SDL audio device (48000 Hz, native signed 16-bit, stereo) and creates
  * the semaphore used to synchronize with the audio callback. The format that
  * was obtained is printed to standard output.
  *
  * Returns true on success. On failure the audio device is left closed and
  * false is returned; the reason is available through SDL_GetError().
  */
 bool init()
 {
  const int fd_buffer_size = static_cast<int>(sizeof(fd_buffer[0]) / sizeof(fd_buffer[0][0]));

  buffer_len = Config::read_setting("audio_buffer");
  offset = 0;
  psg_cycles = 0;

  for (int ch = 0; ch < 2; ch++)
  {
   sum[ch] = 0;
   hpf_a1[ch] = 0;
   hpf_b1[ch] = 0;
   lpf1_b1[ch] = 0;
   lpf2_b1[ch] = 0;

   /* Clear the whole fd buffer of this channel. The bound is the array size and
      not the configured buffer length: the setting has not been validated yet
      (a large value would run past the end of the array), and the driver may
      grant a different buffer size than the one requested. */
   for (int i = 0; i < fd_buffer_size; i++) fd_buffer[ch][i] = 0;
  }
  gen_fd_bleps();

  /* SDL Audio Init */
  desired.freq = 48000;
  desired.format = AUDIO_S16SYS;
  desired.channels = 2;
  desired.samples = buffer_len;  // only a request: the granted size is validated below
  desired.callback = callback;
  desired.userdata = buffer;
  if (SDL_OpenAudio(&desired, &obtained) == -1) return false;
  if ((obtained.freq != 48000) || (obtained.format != AUDIO_S16SYS) || (obtained.channels != 2))
  {
   SDL_CloseAudio();
   SDL_SetError("Unsupported audio format");
   return false;
  }
  if ((obtained.samples < 2) || (obtained.samples > 3072))
  {
   SDL_CloseAudio();
   SDL_SetError("Audio buffer size out of valid range");
   return false;
  }
  buffer_len = obtained.samples;  // from here on use the size that was actually granted
  semaphore = SDL_CreateSemaphore(0);
  if (semaphore == 0)
  {
   SDL_CloseAudio();
   SDL_SetError("Couldn't create audio semaphore");
   return false;
  }

  /* Report the obtained audio format */
  std::cout << "  frequency: " << obtained.freq << "Hz" << std::endl;
  std::cout << "  channels: " << static_cast<unsigned int>(obtained.channels) << std::endl;
  std::cout << "  sample format: ";
  switch (obtained.format)
  {
   case AUDIO_U8: std::cout << "unsigned 8-bit"; break;
   case AUDIO_S8: std::cout << "signed 8-bit"; break;
   case AUDIO_U16LSB: std::cout << "unsigned 16-bit little-endian";
                      if (AUDIO_U16LSB == AUDIO_U16SYS) std::cout << "(native)"; else std::cout << "(not native)";
                      break;
   case AUDIO_S16LSB: std::cout << "signed 16-bit little-endian";
                      if (AUDIO_S16LSB == AUDIO_S16SYS) std::cout << "(native)"; else std::cout << "(not native)";
                      break;
   case AUDIO_U16MSB: std::cout << "unsigned 16-bit big-endian";
                      if (AUDIO_U16MSB == AUDIO_U16SYS) std::cout << "(native)"; else std::cout << "(not native)";
                      break;
   case AUDIO_S16MSB: std::cout << "signed 16-bit big-endian";
                      if (AUDIO_S16MSB == AUDIO_S16SYS) std::cout << "(native)"; else std::cout << "(not native)";
                      break;
   default: std::cout << "unknown"; break;
  }
  std::cout << std::endl;
  std::cout << "  buffer size: " << obtained.samples << " samples" << std::endl;
  return true;
 }

 void deinit()
 {
  SDL_PauseAudio(1);
  SDL_CloseAudio();
 }

 void callback(void *userdata, Uint8 *stream, int len)
 {
  if (buffer_is_full == true)
  {
   full_bufs++;
   for (int i = 0; i < len; i++) stream[i] = ((Uint8*)userdata)[i];
   buffer_is_full = false;
   SDL_SemPost(semaphore);
  }
  else
  {
   empty_bufs++;
   for (int i = 0; i < len; i++) stream[i] = 0;
  }
 }

 /*
  * Renders the next block of samples and hands it over to the audio callback.
  * Blocks until callback() has copied the block out and posted the semaphore.
  */
 void wait_callback()
 {
  get_samples();  // fill 'buffer' with the next block of samples
  buffer_is_full = true;  // tell callback() that 'buffer' is ready to be played
  SDL_SemWait(semaphore);  // wait until callback() has consumed it
 }

 void calc_psg_cycles()
 {
  // (3579545+(5/11))/48000 = 13125/176 = 74+(101/176)
  offset = ((psg_cycles * 176) + offset) % 13125;
  psg_cycles = ((buffer_len * 13125) - offset) / 176;
  int32 temp = ((buffer_len * 13125) - offset) % 176;
  if (temp != 0) psg_cycles++;
 }

 void gen_fd_bleps()
 {
  /* blep (band-limited step) */
  for (int i = 0; i < (FILTER_LEN / 2); i++)
  {
   blep[i] = sin(2 * Pi * Fc * (i - (FILTER_LEN / 2))) / (i - (FILTER_LEN / 2));
   blep[i] *= (0.54 - (0.46 * cos((2 * Pi * i) / FILTER_LEN)));
  }
  for (int i = 1; i < (FILTER_LEN / 2); i++) blep[i] += blep[i - 1];
  float64 k = (2 * blep[(FILTER_LEN / 2) - 1]) + (2 * Pi * Fc);
  for (int i = 0; i < (FILTER_LEN / 2); i++) blep[i] = (blep[i] / k) * 32768.0;
  for (int i = 0; i < (FILTER_LEN / 2); i++) int_blep[i] = (int32)floor(blep[i] + 0.5);
  for (int i = 0; i < (FILTER_LEN / 2); i++) int_blep[(FILTER_LEN - 1) - i] = 32768 - int_blep[i];
  for (int i = FILTER_LEN; i < (PHASE_RES * FDBLEP_LEN); i++) int_blep[i] = 32768;

  /* first difference bleps*/
  for (int phase = 0; phase < PHASE_RES; phase++)
  {
   fd_blep[(PHASE_RES - 1) - phase][0] = (int16)int_blep[phase];
   for (int i = 1; i < FDBLEP_LEN; i++)
   {
    fd_blep[(PHASE_RES - 1) - phase][i] = (int16)(int_blep[(i * PHASE_RES) + phase] - int_blep[((i - 1) * PHASE_RES) + phase]);
   }
  }
 }

 /*
  * Adds one band-limited step to both first-difference buffers.
  *
  * cycle:   PSG cycle of the step; it is mirrored against the current
  *          psg_cycles value before use (NOTE(review): presumably the caller
  *          passes a count of remaining cycles - confirm with the caller)
  * l_delta: amplitude change of the left channel
  * r_delta: amplitude change of the right channel
  *
  * The position is converted to units of 1/176 of a cycle, then split into an
  * output sample index (pos) and one of the PHASE_RES sub-sample phases. The
  * FDBLEP_LEN entries of fd_blep[phase], scaled by the channel delta, are
  * accumulated into fd_buffer[channel][pos .. pos + FDBLEP_LEN - 1].
  *
  * No range check is performed on pos; the caller is assumed to pass a cycle
  * that lies inside the current buffer.
  */
 void add_blep(int32 cycle, int16 l_delta, int16 r_delta)
 {
  cycle = (psg_cycles - 1) - cycle;

  // Divisions replaced by multiplications with a rounded-up reciprocal:
  //   0x09FC8735 = ceil(2^41 / 13125), total shift 32 + 9 = 41  ->  pos = temp / 13125
  //   0x5D9F7391 = ceil(2^38 / 175),   total shift 32 + 6 = 38  ->  phase = remainder / 175
  uint32 temp = (uint32)((cycle * 176) + offset);
  uint32 pos = (uint32)(((uint64)temp * (uint32)0x09FC8735) >> 32) >> 9;
  temp -= pos * 13125;  // temp is now the remainder of the division by 13125
  uint32 phase = (uint32)(((uint64)temp * (uint32)0x5D9F7391) >> 32) >> 6;
  // alt: int32 pos = ((cycle * 176) + offset) / 13125;
  //      int32 phase = (((cycle * 176) + offset) % 13125) / 175;   // '/ 175' for 75 phases

  #ifdef __SSE2__
  {
   // mullo/mulhi yield the low and high 16 bits of each 16x16-bit signed product;
   // unpacklo/unpackhi interleave them into four full 32-bit products each.
   // The fd_blep rows are 16-byte aligned (aligned loads); fd_buffer is accessed
   // at an arbitrary pos, hence the unaligned loads and stores.
   __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
   xmm0 = _mm_set1_epi16 (l_delta);  // left delta in all 8 lanes
   xmm1 = _mm_set1_epi16 (r_delta);  // right delta in all 8 lanes
   xmm2 = _mm_load_si128 ((__m128i const *)&fd_blep[phase][0]);  // taps 0..7
   xmm3 = _mm_load_si128 ((__m128i const *)&fd_blep[phase][8]);  // taps 8..15
   // left channel, taps 0..3
   xmm4 = xmm0;
   xmm4 = _mm_mullo_epi16 (xmm4, xmm2);
   xmm5 = xmm0;
   xmm5 = _mm_mulhi_epi16 (xmm5, xmm2);
   xmm6 = xmm4;
   xmm6 = _mm_unpacklo_epi16 (xmm6, xmm5);
   xmm7 = _mm_loadu_si128 ((__m128i const *)&fd_buffer[0][pos + 0]);
   xmm7 = _mm_add_epi32 (xmm7, xmm6);
   _mm_storeu_si128 ((__m128i *)&fd_buffer[0][pos + 0], xmm7);
   // left channel, taps 4..7
   xmm4 = _mm_unpackhi_epi16 (xmm4, xmm5);
   xmm6 = _mm_loadu_si128 ((__m128i const *)&fd_buffer[0][pos + 4]);
   xmm6 = _mm_add_epi32 (xmm6, xmm4);
   _mm_storeu_si128 ((__m128i *)&fd_buffer[0][pos + 4], xmm6);
   // left channel, taps 8..11
   xmm5 = xmm0;
   xmm5 = _mm_mullo_epi16 (xmm5, xmm3);
   xmm0 = _mm_mulhi_epi16 (xmm0, xmm3);
   xmm4 = xmm5;
   xmm4 = _mm_unpacklo_epi16 (xmm4, xmm0);
   xmm7 = _mm_loadu_si128 ((__m128i const *)&fd_buffer[0][pos + 8]);
   xmm7 = _mm_add_epi32 (xmm7, xmm4);
   _mm_storeu_si128 ((__m128i *)&fd_buffer[0][pos + 8], xmm7);
   // left channel, taps 12..15
   xmm5 = _mm_unpackhi_epi16 (xmm5, xmm0);
   xmm6 = _mm_loadu_si128 ((__m128i const *)&fd_buffer[0][pos + 12]);
   xmm6 = _mm_add_epi32 (xmm6, xmm5);
   _mm_storeu_si128 ((__m128i *)&fd_buffer[0][pos + 12], xmm6);
   // right channel, taps 0..3
   xmm4 = xmm1;
   xmm4 = _mm_mullo_epi16 (xmm4, xmm2);
   xmm2 = _mm_mulhi_epi16 (xmm2, xmm1);
   xmm0 = xmm4;
   xmm0 = _mm_unpacklo_epi16 (xmm0, xmm2);
   xmm7 = _mm_loadu_si128 ((__m128i const *)&fd_buffer[1][pos + 0]);
   xmm7 = _mm_add_epi32 (xmm7, xmm0);
   _mm_storeu_si128 ((__m128i *)&fd_buffer[1][pos + 0], xmm7);
   // right channel, taps 4..7
   xmm4 = _mm_unpackhi_epi16 (xmm4, xmm2);
   xmm6 = _mm_loadu_si128 ((__m128i const *)&fd_buffer[1][pos + 4]);
   xmm6 = _mm_add_epi32 (xmm6, xmm4);
   _mm_storeu_si128 ((__m128i *)&fd_buffer[1][pos + 4], xmm6);
   // right channel, taps 8..11
   xmm5 = xmm1;
   xmm5 = _mm_mullo_epi16 (xmm5, xmm3);
   xmm1 = _mm_mulhi_epi16 (xmm1, xmm3);
   xmm2 = xmm5;
   xmm2 = _mm_unpacklo_epi16 (xmm2, xmm1);
   xmm7 = _mm_loadu_si128 ((__m128i const *)&fd_buffer[1][pos + 8]);
   xmm7 = _mm_add_epi32 (xmm7, xmm2);
   _mm_storeu_si128 ((__m128i *)&fd_buffer[1][pos + 8], xmm7);
   // right channel, taps 12..15
   xmm5 = _mm_unpackhi_epi16 (xmm5, xmm1);
   xmm6 = _mm_loadu_si128 ((__m128i const *)&fd_buffer[1][pos + 12]);
   xmm6 = _mm_add_epi32 (xmm6, xmm5);
   _mm_storeu_si128 ((__m128i *)&fd_buffer[1][pos + 12], xmm6);
  }
  #else
   // portable fallback: the same accumulation, one tap at a time
   for (int i = 0; i < FDBLEP_LEN; i++)
   {
    fd_buffer[0][pos + i] += fd_blep[phase][i] * l_delta;
    fd_buffer[1][pos + i] += fd_blep[phase][i] * r_delta;
   }
  #endif
 }

 void get_samples()
 {
  for (int j = 0; j < 2; j++)
  {
   /* running sum */
   fd_buffer[j][0] += sum[j];
   for (int i = 1; i < buffer_len; i++) fd_buffer[j][i] += fd_buffer[j][i-1];
   sum[j] = fd_buffer[j][buffer_len - 1];

   for (int i = 0; i < buffer_len; i++) float_buf[i] = (float64)fd_buffer[j][i];

   /* single pole RC high-pass filter */
   /* R1 = 1kohm, C1 = 10uF, Fc = ~16Hz */
   const float64 x = exp(-1.0/480.0);   // x=exp(-1.0/(R1*C1*Fs)), Fs=48000Hz
   float64 temp_a0 = float_buf[0];
   float_buf[0] = (((1.0 + x) / 2.0) * (temp_a0 - hpf_a1[j])) + (x * hpf_b1[j]);
   hpf_a1[j] = temp_a0;
   for (int i = 1; i < buffer_len; i++)
   {
    temp_a0 = float_buf[i];
    float_buf[i] = (((1.0 + x) / 2.0) * (temp_a0 - hpf_a1[j])) + (x * float_buf[i - 1]);
    hpf_a1[j] = temp_a0;
   }
   hpf_b1[j] = float_buf[buffer_len - 1];

   /* first single pole RC low-pass filter */
   /* R1 = 1kohm, C2 = 10nF, Fc = ~16kHz   */
   const float64 y = exp(-1.0/0.48);   // y=exp(-1.0/(R1*C2*Fs)), Fs=48000Hz
   float_buf[0] = ((1.0 - y) * float_buf[0]) + (y * lpf1_b1[j]);
   for (int i = 1; i < buffer_len; i++)
   {
    float_buf[i] = ((1.0 - y) * float_buf[i]) + (y * float_buf[i - 1]);
   }
   lpf1_b1[j] = float_buf[buffer_len - 1];

   /* second single pole RC low-pass filter */
   /* R3 = 30kohm, C3 = 330pF, Fc = ~16kHz  */
   const float64 z = exp(-1.0/0.4752);   // z=exp(-1.0/(R3*C3*Fs)), Fs=48000Hz
   float_buf[0] = ((1.0 - z) * float_buf[0]) + (z * lpf2_b1[j]);
   for (int i = 1; i < buffer_len; i++)
   {
    float_buf[i] = ((1.0 - z) * float_buf[i]) + (z * float_buf[i - 1]);
   }
   lpf2_b1[j] = float_buf[buffer_len - 1];

   for (int i = 0; i < buffer_len; i++) buffer[(i << 1) + j] = (int16)floor((float_buf[i] * -3.0517578125E-5) + 0.5);  // '/ -32768.0' -> '* -3.0517578125E-5'
   /* note: the signal level compressor/limiter is not yet implemented */

   /* preserve unused differences and clear the fd buffer */
   for (int i = 0; i < FDBLEP_LEN; i++) fd_buffer[j][i] = fd_buffer[j][buffer_len + i];
   for (int i = FDBLEP_LEN; i < (buffer_len + FDBLEP_LEN); i++) fd_buffer[j][i] = 0;
  }
 }
}
