Linux Kernel  3.7.1
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
echo.c
Go to the documentation of this file.
1 /*
2  * SpanDSP - a series of DSP components for telephony
3  *
4  * echo.c - A line echo canceller. This code is being developed
5  * against and partially complies with G168.
6  *
7  * Written by Steve Underwood <[email protected]>
8  * and David Rowe <david_at_rowetel_dot_com>
9  *
10  * Copyright (C) 2001, 2003 Steve Underwood, 2007 David Rowe
11  *
12  * Based on a bit from here, a bit from there, eye of toad, ear of
13  * bat, 15 years of failed attempts by David and a few fried brain
14  * cells.
15  *
16  * All rights reserved.
17  *
18  * This program is free software; you can redistribute it and/or modify
19  * it under the terms of the GNU General Public License version 2, as
20  * published by the Free Software Foundation.
21  *
22  * This program is distributed in the hope that it will be useful,
23  * but WITHOUT ANY WARRANTY; without even the implied warranty of
24  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25  * GNU General Public License for more details.
26  *
27  * You should have received a copy of the GNU General Public License
28  * along with this program; if not, write to the Free Software
29  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
30  */
31 
34 /* Implementation Notes
35  David Rowe
36  April 2007
37 
38  This code started life as Steve's NLMS algorithm with a tap
39  rotation algorithm to handle divergence during double talk. I
40  added a Geigel Double Talk Detector (DTD) [2] and performed some
41  G168 tests. However I had trouble meeting the G168 requirements,
42  especially for double talk - there were always cases where my DTD
43  failed, for example where near end speech was under the 6dB
44  threshold required for declaring double talk.
45 
46  So I tried a two path algorithm [1], which has so far given better
47  results. The original tap rotation/Geigel algorithm is available
48  in SVN http://svn.rowetel.com/software/oslec/tags/before_16bit.
49  It's probably possible to make it work if some one wants to put some
50  serious work into it.
51 
52  At present no special treatment is provided for tones, which
53  generally cause NLMS algorithms to diverge. Initial runs of a
54  subset of the G168 tests for tones (e.g ./echo_test 6) show the
55  current algorithm is passing OK, which is kind of surprising. The
56  full set of tests needs to be performed to confirm this result.
57 
58  One other interesting change is that I have managed to get the NLMS
59  code to work with 16 bit coefficients, rather than the original 32
60  bit coefficents. This reduces the MIPs and storage required.
61  I evaulated the 16 bit port using g168_tests.sh and listening tests
62  on 4 real-world samples.
63 
64  I also attempted the implementation of a block based NLMS update
65  [2] but although this passes g168_tests.sh it didn't converge well
66  on the real-world samples. I have no idea why, perhaps a scaling
67  problem. The block based code is also available in SVN
68  http://svn.rowetel.com/software/oslec/tags/before_16bit. If this
69  code can be debugged, it will lead to further reduction in MIPS, as
70  the block update code maps nicely onto DSP instruction sets (it's a
71  dot product) compared to the current sample-by-sample update.
72 
73  Steve also has some nice notes on echo cancellers in echo.h
74 
75  References:
76 
77  [1] Ochiai, Areseki, and Ogihara, "Echo Canceller with Two Echo
78  Path Models", IEEE Transactions on communications, COM-25,
79  No. 6, June
80  1977.
81  http://www.rowetel.com/images/echo/dual_path_paper.pdf
82 
83  [2] The classic, very useful paper that tells you how to
84  actually build a real world echo canceller:
85  Messerschmitt, Hedberg, Cole, Haoui, Winship, "Digital Voice
86  Echo Canceller with a TMS320020,
87  http://www.rowetel.com/images/echo/spra129.pdf
88 
89  [3] I have written a series of blog posts on this work, here is
90  Part 1: http://www.rowetel.com/blog/?p=18
91 
92  [4] The source code http://svn.rowetel.com/software/oslec/
93 
94  [5] A nice reference on LMS filters:
95  http://en.wikipedia.org/wiki/Least_mean_squares_filter
96 
97  Credits:
98 
99  Thanks to Steve Underwood, Jean-Marc Valin, and Ramakrishnan
100  Muthukrishnan for their suggestions and email discussions. Thanks
101  also to those people who collected echo samples for me such as
102  Mark, Pawel, and Pavel.
103 */
104 
105 #include <linux/kernel.h>
106 #include <linux/module.h>
107 #include <linux/slab.h>
108 
109 #include "echo.h"
110 
111 #define MIN_TX_POWER_FOR_ADAPTION 64
112 #define MIN_RX_POWER_FOR_ADAPTION 64
113 #define DTD_HANGOVER 600 /* 600 samples, or 75ms */
114 #define DC_LOG2BETA 3 /* log2() of DC filter Beta */
115 
116 /* adapting coeffs using the traditional stochastic descent (N)LMS algorithm */
117 
118 #ifdef __bfin__
119 static inline void lms_adapt_bg(struct oslec_state *ec, int clean, int shift)
120 {
121  int i;
122  int j;
123  int offset1;
124  int offset2;
125  int factor;
126  int exp;
127  int16_t *phist;
128  int n;
129 
130  if (shift > 0)
131  factor = clean << shift;
132  else
133  factor = clean >> -shift;
134 
135  /* Update the FIR taps */
136 
137  offset2 = ec->curr_pos;
138  offset1 = ec->taps - offset2;
139  phist = &ec->fir_state_bg.history[offset2];
140 
141  /* st: and en: help us locate the assembler in echo.s */
142 
143  /* asm("st:"); */
144  n = ec->taps;
145  for (i = 0, j = offset2; i < n; i++, j++) {
146  exp = *phist++ * factor;
147  ec->fir_taps16[1][i] += (int16_t) ((exp + (1 << 14)) >> 15);
148  }
149  /* asm("en:"); */
150 
151  /* Note the asm for the inner loop above generated by Blackfin gcc
152  4.1.1 is pretty good (note even parallel instructions used):
153 
154  R0 = W [P0++] (X);
155  R0 *= R2;
156  R0 = R0 + R3 (NS) ||
157  R1 = W [P1] (X) ||
158  nop;
159  R0 >>>= 15;
160  R0 = R0 + R1;
161  W [P1++] = R0;
162 
163  A block based update algorithm would be much faster but the
164  above can't be improved on much. Every instruction saved in
165  the loop above is 2 MIPs/ch! The for loop above is where the
166  Blackfin spends most of it's time - about 17 MIPs/ch measured
167  with speedtest.c with 256 taps (32ms). Write-back and
168  Write-through cache gave about the same performance.
169  */
170 }
171 
172 /*
173  IDEAS for further optimisation of lms_adapt_bg():
174 
175  1/ The rounding is quite costly. Could we keep as 32 bit coeffs
176  then make filter pluck the MS 16-bits of the coeffs when filtering?
177  However this would lower potential optimisation of filter, as I
178  think the dual-MAC architecture requires packed 16 bit coeffs.
179 
180  2/ Block based update would be more efficient, as per comments above,
181  could use dual MAC architecture.
182 
183  3/ Look for same sample Blackfin LMS code, see if we can get dual-MAC
184  packing.
185 
186  4/ Execute the whole e/c in a block of say 20ms rather than sample
187  by sample. Processing a few samples every ms is inefficient.
188 */
189 
190 #else
191 static inline void lms_adapt_bg(struct oslec_state *ec, int clean, int shift)
192 {
193  int i;
194 
195  int offset1;
196  int offset2;
197  int factor;
198  int exp;
199 
200  if (shift > 0)
201  factor = clean << shift;
202  else
203  factor = clean >> -shift;
204 
205  /* Update the FIR taps */
206 
207  offset2 = ec->curr_pos;
208  offset1 = ec->taps - offset2;
209 
210  for (i = ec->taps - 1; i >= offset1; i--) {
211  exp = (ec->fir_state_bg.history[i - offset1] * factor);
212  ec->fir_taps16[1][i] += (int16_t) ((exp + (1 << 14)) >> 15);
213  }
214  for (; i >= 0; i--) {
215  exp = (ec->fir_state_bg.history[i + offset2] * factor);
216  ec->fir_taps16[1][i] += (int16_t) ((exp + (1 << 14)) >> 15);
217  }
218 }
219 #endif
220 
221 static inline int top_bit(unsigned int bits)
222 {
223  if (bits == 0)
224  return -1;
225  else
226  return (int)fls((int32_t) bits) - 1;
227 }
228 
230 {
231  struct oslec_state *ec;
232  int i;
233 
234  ec = kzalloc(sizeof(*ec), GFP_KERNEL);
235  if (!ec)
236  return NULL;
237 
238  ec->taps = len;
239  ec->log2taps = top_bit(len);
240  ec->curr_pos = ec->taps - 1;
241 
242  for (i = 0; i < 2; i++) {
243  ec->fir_taps16[i] =
244  kcalloc(ec->taps, sizeof(int16_t), GFP_KERNEL);
245  if (!ec->fir_taps16[i])
246  goto error_oom;
247  }
248 
249  fir16_create(&ec->fir_state, ec->fir_taps16[0], ec->taps);
250  fir16_create(&ec->fir_state_bg, ec->fir_taps16[1], ec->taps);
251 
252  for (i = 0; i < 5; i++)
253  ec->xvtx[i] = ec->yvtx[i] = ec->xvrx[i] = ec->yvrx[i] = 0;
254 
255  ec->cng_level = 1000;
256  oslec_adaption_mode(ec, adaption_mode);
257 
258  ec->snapshot = kcalloc(ec->taps, sizeof(int16_t), GFP_KERNEL);
259  if (!ec->snapshot)
260  goto error_oom;
261 
262  ec->cond_met = 0;
263  ec->Pstates = 0;
264  ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
265  ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
266  ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
267  ec->Lbgn = ec->Lbgn_acc = 0;
268  ec->Lbgn_upper = 200;
269  ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
270 
271  return ec;
272 
273 error_oom:
274  for (i = 0; i < 2; i++)
275  kfree(ec->fir_taps16[i]);
276 
277  kfree(ec);
278  return NULL;
279 }
281 
282 void oslec_free(struct oslec_state *ec)
283 {
284  int i;
285 
286  fir16_free(&ec->fir_state);
287  fir16_free(&ec->fir_state_bg);
288  for (i = 0; i < 2; i++)
289  kfree(ec->fir_taps16[i]);
290  kfree(ec->snapshot);
291  kfree(ec);
292 }
294 
296 {
298 }
300 
301 void oslec_flush(struct oslec_state *ec)
302 {
303  int i;
304 
305  ec->Ltxacc = ec->Lrxacc = ec->Lcleanacc = ec->Lclean_bgacc = 0;
306  ec->Ltx = ec->Lrx = ec->Lclean = ec->Lclean_bg = 0;
307  ec->tx_1 = ec->tx_2 = ec->rx_1 = ec->rx_2 = 0;
308 
309  ec->Lbgn = ec->Lbgn_acc = 0;
310  ec->Lbgn_upper = 200;
311  ec->Lbgn_upper_acc = ec->Lbgn_upper << 13;
312 
313  ec->nonupdate_dwell = 0;
314 
315  fir16_flush(&ec->fir_state);
316  fir16_flush(&ec->fir_state_bg);
317  ec->fir_state.curr_pos = ec->taps - 1;
318  ec->fir_state_bg.curr_pos = ec->taps - 1;
319  for (i = 0; i < 2; i++)
320  memset(ec->fir_taps16[i], 0, ec->taps * sizeof(int16_t));
321 
322  ec->curr_pos = ec->taps - 1;
323  ec->Pstates = 0;
324 }
326 
327 void oslec_snapshot(struct oslec_state *ec)
328 {
329  memcpy(ec->snapshot, ec->fir_taps16[0], ec->taps * sizeof(int16_t));
330 }
332 
333 /* Dual Path Echo Canceller */
334 
336 {
337  int32_t echo_value;
338  int clean_bg;
339  int tmp;
340  int tmp1;
341 
342  /*
343  * Input scaling was found be required to prevent problems when tx
344  * starts clipping. Another possible way to handle this would be the
345  * filter coefficent scaling.
346  */
347 
348  ec->tx = tx;
349  ec->rx = rx;
350  tx >>= 1;
351  rx >>= 1;
352 
353  /*
354  * Filter DC, 3dB point is 160Hz (I think), note 32 bit precision
355  * required otherwise values do not track down to 0. Zero at DC, Pole
356  * at (1-Beta) on real axis. Some chip sets (like Si labs) don't
357  * need this, but something like a $10 X100P card does. Any DC really
358  * slows down convergence.
359  *
360  * Note: removes some low frequency from the signal, this reduces the
361  * speech quality when listening to samples through headphones but may
362  * not be obvious through a telephone handset.
363  *
364  * Note that the 3dB frequency in radians is approx Beta, e.g. for Beta
365  * = 2^(-3) = 0.125, 3dB freq is 0.125 rads = 159Hz.
366  */
367 
369  tmp = rx << 15;
370 
371  /*
372  * Make sure the gain of the HPF is 1.0. This can still
373  * saturate a little under impulse conditions, and it might
374  * roll to 32768 and need clipping on sustained peak level
375  * signals. However, the scale of such clipping is small, and
376  * the error due to any saturation should not markedly affect
377  * the downstream processing.
378  */
379  tmp -= (tmp >> 4);
380 
381  ec->rx_1 += -(ec->rx_1 >> DC_LOG2BETA) + tmp - ec->rx_2;
382 
383  /*
384  * hard limit filter to prevent clipping. Note that at this
385  * stage rx should be limited to +/- 16383 due to right shift
386  * above
387  */
388  tmp1 = ec->rx_1 >> 15;
389  if (tmp1 > 16383)
390  tmp1 = 16383;
391  if (tmp1 < -16383)
392  tmp1 = -16383;
393  rx = tmp1;
394  ec->rx_2 = tmp;
395  }
396 
397  /* Block average of power in the filter states. Used for
398  adaption power calculation. */
399 
400  {
401  int new, old;
402 
403  /* efficient "out with the old and in with the new" algorithm so
404  we don't have to recalculate over the whole block of
405  samples. */
406  new = (int)tx * (int)tx;
407  old = (int)ec->fir_state.history[ec->fir_state.curr_pos] *
408  (int)ec->fir_state.history[ec->fir_state.curr_pos];
409  ec->Pstates +=
410  ((new - old) + (1 << (ec->log2taps - 1))) >> ec->log2taps;
411  if (ec->Pstates < 0)
412  ec->Pstates = 0;
413  }
414 
415  /* Calculate short term average levels using simple single pole IIRs */
416 
417  ec->Ltxacc += abs(tx) - ec->Ltx;
418  ec->Ltx = (ec->Ltxacc + (1 << 4)) >> 5;
419  ec->Lrxacc += abs(rx) - ec->Lrx;
420  ec->Lrx = (ec->Lrxacc + (1 << 4)) >> 5;
421 
422  /* Foreground filter */
423 
424  ec->fir_state.coeffs = ec->fir_taps16[0];
425  echo_value = fir16(&ec->fir_state, tx);
426  ec->clean = rx - echo_value;
427  ec->Lcleanacc += abs(ec->clean) - ec->Lclean;
428  ec->Lclean = (ec->Lcleanacc + (1 << 4)) >> 5;
429 
430  /* Background filter */
431 
432  echo_value = fir16(&ec->fir_state_bg, tx);
433  clean_bg = rx - echo_value;
434  ec->Lclean_bgacc += abs(clean_bg) - ec->Lclean_bg;
435  ec->Lclean_bg = (ec->Lclean_bgacc + (1 << 4)) >> 5;
436 
437  /* Background Filter adaption */
438 
439  /* Almost always adap bg filter, just simple DT and energy
440  detection to minimise adaption in cases of strong double talk.
441  However this is not critical for the dual path algorithm.
442  */
443  ec->factor = 0;
444  ec->shift = 0;
445  if ((ec->nonupdate_dwell == 0)) {
446  int P, logP, shift;
447 
448  /* Determine:
449 
450  f = Beta * clean_bg_rx/P ------ (1)
451 
452  where P is the total power in the filter states.
453 
454  The Boffins have shown that if we obey (1) we converge
455  quickly and avoid instability.
456 
457  The correct factor f must be in Q30, as this is the fixed
458  point format required by the lms_adapt_bg() function,
459  therefore the scaled version of (1) is:
460 
461  (2^30) * f = (2^30) * Beta * clean_bg_rx/P
462  factor = (2^30) * Beta * clean_bg_rx/P ----- (2)
463 
464  We have chosen Beta = 0.25 by experiment, so:
465 
466  factor = (2^30) * (2^-2) * clean_bg_rx/P
467 
468  (30 - 2 - log2(P))
469  factor = clean_bg_rx 2 ----- (3)
470 
471  To avoid a divide we approximate log2(P) as top_bit(P),
472  which returns the position of the highest non-zero bit in
473  P. This approximation introduces an error as large as a
474  factor of 2, but the algorithm seems to handle it OK.
475 
476  Come to think of it a divide may not be a big deal on a
477  modern DSP, so its probably worth checking out the cycles
478  for a divide versus a top_bit() implementation.
479  */
480 
482  logP = top_bit(P) + ec->log2taps;
483  shift = 30 - 2 - logP;
484  ec->shift = shift;
485 
486  lms_adapt_bg(ec, clean_bg, shift);
487  }
488 
489  /* very simple DTD to make sure we dont try and adapt with strong
490  near end speech */
491 
492  ec->adapt = 0;
493  if ((ec->Lrx > MIN_RX_POWER_FOR_ADAPTION) && (ec->Lrx > ec->Ltx))
495  if (ec->nonupdate_dwell)
496  ec->nonupdate_dwell--;
497 
498  /* Transfer logic */
499 
500  /* These conditions are from the dual path paper [1], I messed with
501  them a bit to improve performance. */
502 
503  if ((ec->adaption_mode & ECHO_CAN_USE_ADAPTION) &&
504  (ec->nonupdate_dwell == 0) &&
505  /* (ec->Lclean_bg < 0.875*ec->Lclean) */
506  (8 * ec->Lclean_bg < 7 * ec->Lclean) &&
507  /* (ec->Lclean_bg < 0.125*ec->Ltx) */
508  (8 * ec->Lclean_bg < ec->Ltx)) {
509  if (ec->cond_met == 6) {
510  /*
511  * BG filter has had better results for 6 consecutive
512  * samples
513  */
514  ec->adapt = 1;
515  memcpy(ec->fir_taps16[0], ec->fir_taps16[1],
516  ec->taps * sizeof(int16_t));
517  } else
518  ec->cond_met++;
519  } else
520  ec->cond_met = 0;
521 
522  /* Non-Linear Processing */
523 
524  ec->clean_nlp = ec->clean;
525  if (ec->adaption_mode & ECHO_CAN_USE_NLP) {
526  /*
527  * Non-linear processor - a fancy way to say "zap small
528  * signals, to avoid residual echo due to (uLaw/ALaw)
529  * non-linearity in the channel.".
530  */
531 
532  if ((16 * ec->Lclean < ec->Ltx)) {
533  /*
534  * Our e/c has improved echo by at least 24 dB (each
535  * factor of 2 is 6dB, so 2*2*2*2=16 is the same as
536  * 6+6+6+6=24dB)
537  */
538  if (ec->adaption_mode & ECHO_CAN_USE_CNG) {
539  ec->cng_level = ec->Lbgn;
540 
541  /*
542  * Very elementary comfort noise generation.
543  * Just random numbers rolled off very vaguely
544  * Hoth-like. DR: This noise doesn't sound
545  * quite right to me - I suspect there are some
546  * overflow issues in the filtering as it's too
547  * "crackly".
548  * TODO: debug this, maybe just play noise at
549  * high level or look at spectrum.
550  */
551 
552  ec->cng_rndnum =
553  1664525U * ec->cng_rndnum + 1013904223U;
554  ec->cng_filter =
555  ((ec->cng_rndnum & 0xFFFF) - 32768 +
556  5 * ec->cng_filter) >> 3;
557  ec->clean_nlp =
558  (ec->cng_filter * ec->cng_level * 8) >> 14;
559 
560  } else if (ec->adaption_mode & ECHO_CAN_USE_CLIP) {
561  /* This sounds much better than CNG */
562  if (ec->clean_nlp > ec->Lbgn)
563  ec->clean_nlp = ec->Lbgn;
564  if (ec->clean_nlp < -ec->Lbgn)
565  ec->clean_nlp = -ec->Lbgn;
566  } else {
567  /*
568  * just mute the residual, doesn't sound very
569  * good, used mainly in G168 tests
570  */
571  ec->clean_nlp = 0;
572  }
573  } else {
574  /*
575  * Background noise estimator. I tried a few
576  * algorithms here without much luck. This very simple
577  * one seems to work best, we just average the level
578  * using a slow (1 sec time const) filter if the
579  * current level is less than a (experimentally
580  * derived) constant. This means we dont include high
581  * level signals like near end speech. When combined
582  * with CNG or especially CLIP seems to work OK.
583  */
584  if (ec->Lclean < 40) {
585  ec->Lbgn_acc += abs(ec->clean) - ec->Lbgn;
586  ec->Lbgn = (ec->Lbgn_acc + (1 << 11)) >> 12;
587  }
588  }
589  }
590 
591  /* Roll around the taps buffer */
592  if (ec->curr_pos <= 0)
593  ec->curr_pos = ec->taps;
594  ec->curr_pos--;
595 
597  ec->clean_nlp = rx;
598 
599  /* Output scaled back up again to match input scaling */
600 
601  return (int16_t) ec->clean_nlp << 1;
602 }
604 
605 /* This function is separated from the echo canceller is it is usually called
606  as part of the tx process. See rx HP (DC blocking) filter above, it's
607  the same design.
608 
609  Some soft phones send speech signals with a lot of low frequency
610  energy, e.g. down to 20Hz. This can make the hybrid non-linear
611  which causes the echo canceller to fall over. This filter can help
612  by removing any low frequency before it gets to the tx port of the
613  hybrid.
614 
615  It can also help by removing and DC in the tx signal. DC is bad
616  for LMS algorithms.
617 
618  This is one of the classic DC removal filters, adjusted to provide
619  sufficient bass rolloff to meet the above requirement to protect hybrids
620  from things that upset them. The difference between successive samples
621  produces a lousy HPF, and then a suitably placed pole flattens things out.
622  The final result is a nicely rolled off bass end. The filtering is
623  implemented with extended fractional precision, which noise shapes things,
624  giving very clean DC removal.
625 */
626 
628 {
629  int tmp;
630  int tmp1;
631 
633  tmp = tx << 15;
634 
635  /*
636  * Make sure the gain of the HPF is 1.0. The first can still
637  * saturate a little under impulse conditions, and it might
638  * roll to 32768 and need clipping on sustained peak level
639  * signals. However, the scale of such clipping is small, and
640  * the error due to any saturation should not markedly affect
641  * the downstream processing.
642  */
643  tmp -= (tmp >> 4);
644 
645  ec->tx_1 += -(ec->tx_1 >> DC_LOG2BETA) + tmp - ec->tx_2;
646  tmp1 = ec->tx_1 >> 15;
647  if (tmp1 > 32767)
648  tmp1 = 32767;
649  if (tmp1 < -32767)
650  tmp1 = -32767;
651  tx = tmp1;
652  ec->tx_2 = tmp;
653  }
654 
655  return tx;
656 }
658 
659 MODULE_LICENSE("GPL");
660 MODULE_AUTHOR("David Rowe");
661 MODULE_DESCRIPTION("Open Source Line Echo Canceller");
662 MODULE_VERSION("0.3.0");