<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <!-- Document metadata for a pdf2htmlEX-generated page. The stylesheets and
       scripts below are loaded in this order; the inline script depends on
       pdf2htmlEX.min.js having been executed first, so none of them is
       deferred. -->
  <meta charset="utf-8">
  <meta name="generator" content="pdf2htmlEX">
  <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
  <link rel="stylesheet" href="https://static.pudn.com/base/css/base.min.css">
  <link rel="stylesheet" href="https://static.pudn.com/base/css/fancy.min.css">
  <link rel="stylesheet" href="https://static.pudn.com/prod/directory_preview_static/625b94e3be9ad24cfa1d841d/raw.css">
  <script src="https://static.pudn.com/base/js/compatibility.min.js"></script>
  <script src="https://static.pudn.com/base/js/pdf2htmlEX.min.js"></script>
  <script>
    // Best-effort viewer start-up: if the pdf2htmlEX script did not load (or
    // the constructor throws), the statically positioned page markup is still
    // shown, so the error is reported instead of being allowed to propagate.
    try {
      pdf2htmlEX.defaultViewer = new pdf2htmlEX.Viewer({});
    } catch (e) {
      // Guarded because some legacy browsers expose no console object.
      if (window.console && console.warn) {
        console.warn("pdf2htmlEX viewer could not be initialised:", e);
      }
    }
  </script>
  <!-- Title taken from the heading rendered on page 1 of the document. -->
  <title>VoicePop: A Pop Noise based Anti-spoofing System for Voice Authentication on Smartphones</title>
</head>
<body>
<!-- Sidebar wrapper, kept out of view by its inline display:none, holding an
     empty #outline container. NOTE(review): presumably these ids are looked up
     by the pdf2htmlEX viewer script for the document outline; confirm before
     renaming them or adding child nodes inside #outline. -->
<div id="sidebar" style="display: none">
<div id="outline">
</div>
</div>
<div id="pf1" class="pf w0 h0" data-page-no="1"><div class="pc pc1 w0 h0"><img class="bi x0 y0 w1 h1" alt="" src="https://static.pudn.com/prod/directory_preview_static/625b94e3be9ad24cfa1d841d/bg1.jpg"><div class="t m0 x1 h2 y1 ff1 fs0 fc0 sc0 ls0 ws0">V<span class="_ _0"></span>oicePop:<span class="_ _1"> </span>A<span class="_ _1"> </span>Pop<span class="_ _1"> </span>Noise<span class="_ _1"> </span>based<span class="_ _1"> </span>Anti-spoofing<span class="_ _1"> </span>System</div><div class="t m0 x2 h2 y2 ff1 fs0 fc0 sc0 ls0 ws0">for<span class="_ _2"> </span>V<span class="_ _0"></span>oice<span class="_ _2"> </span>Authentication<span class="_ _2"> </span>on<span class="_ _2"> </span>Smartphones</div><div class="t m0 x3 h3 y3 ff1 fs1 fc0 sc0 ls0 ws0">Qian<span class="_ _3"> </span>W<span class="_ _4"></span>ang</div><div class="t m0 x4 h4 y4 ff2 fs2 fc0 sc0 ls0 ws0">†</div><div class="t m0 x5 h3 y3 ff1 fs1 fc0 sc0 ls0 ws0">,<span class="_ _3"> </span>Xiu<span class="_ _3"> </span>Lin</div><div class="t m0 x6 h4 y4 ff2 fs2 fc0 sc0 ls0 ws0">†</div><div class="t m0 x7 h3 y3 ff1 fs1 fc0 sc0 ls0 ws0">,<span class="_ _3"> </span>Man<span class="_ _3"> </span>Zhou</div><div class="t m0 x8 h4 y4 ff2 fs2 fc0 sc0 ls0 ws0">†</div><div class="t m0 x9 h3 y3 ff1 fs1 fc0 sc0 ls0 ws0">,<span class="_ _3"> </span>Y<span class="_ _4"></span>anjiao<span class="_ _3"> </span>Chen</div><div class="t m0 xa h4 y4 ff2 fs2 fc0 sc0 ls0 ws0">∗</div><div class="t m0 xb h3 y3 ff1 fs1 fc0 sc0 ls0 ws0">,<span class="_ _3"> </span>Cong<span class="_ _3"> </span>W<span class="_ _4"></span>ang</div><div class="t m0 xc h4 y4 ff2 fs2 fc0 sc0 ls0 ws0">‡</div><div class="t m0 xd h3 y3 ff1 fs1 fc0 sc0 ls0 ws0">,<span class="_ _3"> </span>Qi<span class="_ _3"> </span>Li</div><div class="t m0 xe h4 y4 ff2 fs2 fc0 sc0 ls0 ws0">§</div><div class="t m0 xf h3 y3 ff1 fs1 fc0 sc0 ls0 ws0">,<span class="_ _3"> </span>Xiangyang<span class="_ _3"> </span>Luo</div><div class="t m0 x10 h4 y4 ff2 fs2 fc0 sc0 ls0 
ws0">¶</div><div class="t m0 x4 h4 y5 ff2 fs2 fc0 sc0 ls0 ws0">†</div><div class="t m0 x5 h5 y6 ff1 fs3 fc0 sc0 ls0 ws0">School<span class="_ _5"> </span>of<span class="_ _5"> </span>Cyber<span class="_ _5"> </span>Science<span class="_ _5"> </span>and<span class="_ _5"> </span>Engineering,<span class="_ _5"> </span>W<span class="_ _6"></span>uhan<span class="_ _5"> </span>Univ<span class="_ _6"></span>ersity<span class="_ _6"></span>,<span class="_ _5"> </span>P<span class="_ _7"></span>.<span class="_ _5"> </span>R.<span class="_ _5"> </span>China.</div><div class="t m0 x11 h4 y7 ff2 fs2 fc0 sc0 ls0 ws0">∗</div><div class="t m0 x12 h5 y8 ff1 fs3 fc0 sc0 ls0 ws0">School<span class="_ _5"> </span>of<span class="_ _5"> </span>Computer<span class="_ _5"> </span>Science,<span class="_ _5"> </span>W<span class="_ _6"></span>uhan<span class="_ _5"> </span>Univ<span class="_ _6"></span>ersity<span class="_ _6"></span>,<span class="_ _5"> </span>P<span class="_ _7"></span>.<span class="_ _5"> </span>R.<span class="_ _5"> </span>China.</div><div class="t m0 x13 h4 y9 ff2 fs2 fc0 sc0 ls0 ws0">‡</div><div class="t m0 x14 h5 ya ff1 fs3 fc0 sc0 ls0 ws0">Department<span class="_ _5"> </span>of<span class="_ _5"> </span>Computer<span class="_ _5"> </span>Science,<span class="_ _5"> </span>City<span class="_ _5"> </span>Univ<span class="_ _6"></span>ersity<span class="_ _5"> </span>of<span class="_ _5"> </span>Hong<span class="_ _5"> </span>Kong,<span class="_ _5"> </span>Hong<span class="_ _5"> </span>K<span class="_ _6"></span>ong,<span class="_ _5"> </span>P<span class="_ _7"></span>.<span class="_ _5"> </span>R.<span class="_ _5"> </span>China.</div><div class="t m0 x15 h4 yb ff2 fs2 fc0 sc0 ls0 ws0">§</div><div class="t m0 x16 h5 yc ff1 fs3 fc0 sc0 ls0 ws0">Institute<span class="_ _5"> </span>for<span class="_ _5"> </span>Network<span class="_ _5"> </span>Sciences<span class="_ _5"> </span>and<span class="_ _5"> </span>Cyberspace,<span class="_ _5"> </span>Tsinghua<span 
class="_ _5"> </span>Uni<span class="_ _6"></span>versity<span class="_ _4"></span>,<span class="_ _5"> </span>P<span class="_ _4"></span>.<span class="_ _5"> </span>R.<span class="_ _5"> </span>China.</div><div class="t m0 x17 h4 yd ff2 fs2 fc0 sc0 ls0 ws0">¶</div><div class="t m0 x18 h5 ye ff1 fs3 fc0 sc0 ls0 ws0">The<span class="_ _5"> </span>State<span class="_ _5"> </span>Ke<span class="_ _6"></span>y<span class="_ _5"> </span>Laboratory<span class="_ _5"> </span>of<span class="_ _5"> </span>Mathematical<span class="_ _5"> </span>Engineering<span class="_ _5"> </span>and<span class="_ _5"> </span>Advanced<span class="_ _5"> </span>Computing,<span class="_ _5"> </span>Zhengzhou,<span class="_ _5"> </span>P<span class="_ _7"></span>.<span class="_ _5"> </span>R.<span class="_ _5"> </span>China.</div><div class="t m0 x19 h6 yf ff3 fs4 fc0 sc0 ls0 ws0">Abstract<span class="ff4">—V<span class="_ _4"></span>oice<span class="_ _8"> </span>biometrics<span class="_ _8"> </span>is<span class="_ _8"> </span>widely<span class="_ _8"> </span>adopted<span class="_ _8"> </span>f<span class="_ _6"></span>or<span class="_ _8"> </span>identity</span></div><div class="t m0 x1 h7 y10 ff4 fs4 fc0 sc0 ls0 ws0">authentication<span class="_ _3"> </span>in<span class="_ _3"> </span>mobile<span class="_ _5"> </span>devices.<span class="_ _3"> </span>However<span class="_ _7"></span>,<span class="_ _3"> </span>voice<span class="_ _3"> </span>authentication</div><div class="t m0 x1 h7 y11 ff4 fs4 fc0 sc0 ls0 ws0">is<span class="_ _9"> </span>vulnerable<span class="_ _9"> </span>to<span class="_ _9"> </span>spoofing<span class="_ _9"> </span>attacks,<span class="_ _9"> </span>where<span class="_ _9"> </span>an<span class="_ _9"> </span>adv<span class="_ _6"></span>ersary<span class="_ _9"> </span>may</div><div class="t m0 x1 h7 y12 ff4 fs4 fc0 sc0 ls0 ws0">deceive<span class="_ _a"> </span>the<span class="_ _a"> </span>voice<span class="_ _b"> </span>authentication<span class="_ _a"> 
</span>system<span class="_ _b"> </span>with<span class="_ _b"> </span>pre-r<span class="_ _6"></span>ecorded<span class="_ _b"> </span>or</div><div class="t m0 x1 h7 y13 ff4 fs4 fc0 sc0 ls0 ws0">synthesized<span class="_"> </span>samples<span class="_"> </span>from<span class="_"> </span>the<span class="_"> </span>legitimate<span class="_ _c"> </span>user<span class="_"> </span>or<span class="_"> </span>by<span class="_ _c"> </span>impersonating</div><div class="t m0 x1 h7 y14 ff4 fs4 fc0 sc0 ls0 ws0">the<span class="_ _5"> </span>speaking<span class="_ _5"> </span>style<span class="_ _5"> </span>of<span class="_ _5"> </span>the<span class="_ _d"> </span>targeted<span class="_ _d"> </span>user<span class="_ _4"></span>.<span class="_ _5"> </span>In<span class="_ _5"> </span>this<span class="_ _5"> </span>paper<span class="_ _4"></span>,<span class="_ _5"> </span>we<span class="_ _d"> </span>design</div><div class="t m0 x1 h7 y15 ff4 fs4 fc0 sc0 ls0 ws0">and<span class="_ _e"> </span>implement<span class="_ _e"> </span>V<span class="_ _4"></span>oicePop,<span class="_ _e"> </span>a<span class="_ _e"> </span>r<span class="_ _6"></span>obust<span class="_ _e"> </span>software-only<span class="_ _e"> </span>anti-spoofing</div><div class="t m0 x1 h6 y16 ff4 fs4 fc0 sc0 ls0 ws0">system<span class="_ _d"> </span>on<span class="_ _f"> </span>smartphones.<span class="_ _f"> </span>V<span class="_ _4"></span>oicePop<span class="_ _f"> </span>leverages<span class="_ _f"> </span>the<span class="_ _d"> </span><span class="ff3">pop<span class="_ _f"> </span>noise</span>,<span class="_ _d"> </span>which</div><div class="t m0 x1 h7 y17 ff4 fs4 fc0 sc0 ls0 ws0">is<span class="_ _10"> </span>produced<span class="_ _3"> </span>by<span class="_ _10"> </span>the<span class="_ _10"> </span>user<span class="_ _10"> </span>breathing<span class="_ _10"> </span>while<span class="_ _10"> </span>speaking<span class="_ _10"> </span>close<span class="_ _10"> </span>to<span class="_ _10"> 
</span>the</div><div class="t m0 x1 h7 y18 ff4 fs4 fc0 sc0 ls0 ws0">microphone.<span class="_ _9"> </span>The<span class="_ _11"> </span>pop<span class="_ _11"> </span>noise<span class="_ _9"> </span>is<span class="_ _9"> </span>delicate<span class="_ _9"> </span>and<span class="_ _11"> </span>subject<span class="_ _9"> </span>to<span class="_ _9"> </span>user</div><div class="t m0 x1 h7 y19 ff4 fs4 fc0 sc0 ls0 ws0">diversity<span class="_ _4"></span>,<span class="_ _a"> </span>making<span class="_ _e"> </span>it<span class="_ _e"> </span>hard<span class="_ _a"> </span>to<span class="_ _e"> </span>record<span class="_ _e"> </span>by<span class="_ _e"> </span>replay<span class="_ _e"> </span>attacks<span class="_ _e"> </span>beyond</div><div class="t m0 x1 h7 y1a ff4 fs4 fc0 sc0 ls0 ws0">a<span class="_ _a"> </span>certain<span class="_ _a"> </span>distance<span class="_ _e"> </span>and<span class="_ _a"> </span>to<span class="_ _a"> </span>imitate<span class="_ _a"> </span>precisely<span class="_ _e"> </span>by<span class="_ _a"> </span>impersonators.</div><div class="t m0 x1 h7 y1b ff4 fs4 fc0 sc0 ls0 ws0">W<span class="_ _6"></span>e<span class="_ _3"> </span>design<span class="_ _3"> </span>a<span class="_ _3"> </span>novel<span class="_ _5"> </span>pop<span class="_ _3"> </span>noise<span class="_ _3"> </span>detection<span class="_ _10"> </span>scheme<span class="_ _3"> </span>to<span class="_ _3"> </span>pinpoint<span class="_ _3"> </span>pop</div><div class="t m0 x1 h7 y1c ff4 fs4 fc0 sc0 ls0 ws0">noises<span class="_ _5"> </span>at<span class="_ _5"> </span>the<span class="_ _3"> </span>phonemic<span class="_ _5"> </span>level,<span class="_ _d"> </span>based<span class="_ _3"> </span>on<span class="_ _5"> </span>which<span class="_ _5"> </span>we<span class="_ _5"> </span>establish<span class="_ _3"> </span>indi-</div><div class="t m0 x1 h7 y1d ff4 fs4 fc0 sc0 ls0 ws0">vidually<span class="_ _3"> </span>unique<span class="_ _3"> </span>r<span class="_ 
_6"></span>elationship<span class="_ _3"> </span>between<span class="_ _3"> </span>phonemes<span class="_ _3"> </span>and<span class="_ _3"> </span>pop<span class="_ _3"> </span>noises</div><div class="t m0 x1 h7 y1e ff4 fs4 fc0 sc0 ls0 ws0">to<span class="_ _d"> </span>identify<span class="_ _5"> </span>legitimate<span class="_ _5"> </span>users<span class="_ _d"> </span>and<span class="_ _5"> </span>defend<span class="_ _5"> </span>against<span class="_ _5"> </span>spoofing<span class="_ _d"> </span>attacks.</div><div class="t m0 x1 h7 y1f ff4 fs4 fc0 sc0 ls0 ws0">Our<span class="_ _f"> </span>experimental<span class="_ _f"> </span>results<span class="_ _f"> </span>with<span class="_ _f"> </span>18<span class="_ _d"> </span>participants<span class="_ _f"> </span>and<span class="_ _f"> </span>three<span class="_ _f"> </span>types<span class="_ _f"> </span>of</div><div class="t m0 x1 h7 y20 ff4 fs4 fc0 sc0 ls0 ws0">smartphones<span class="_ _d"> </span>show<span class="_ _5"> </span>that<span class="_ _5"> </span>V<span class="_ _7"></span>oicePop<span class="_ _d"> </span>achieves<span class="_ _d"> </span>over<span class="_ _5"> </span>93.5%<span class="_ _d"> </span>detection</div><div class="t m0 x1 h7 y21 ff4 fs4 fc0 sc0 ls0 ws0">accuracy<span class="_ _e"> </span>at<span class="_ _10"> </span>around<span class="_ _e"> </span>5.4%<span class="_ _10"> </span>equal<span class="_ _e"> </span>error<span class="_ _e"> </span>rate.<span class="_ _10"> </span>V<span class="_ _4"></span>oicePop<span class="_ _e"> </span>requir<span class="_ _6"></span>es</div><div class="t m0 x1 h7 y22 ff4 fs4 fc0 sc0 ls0 ws0">no<span class="_ _a"> </span>additional<span class="_ _a"> </span>hardware<span class="_ _e"> </span>but<span class="_ _a"> </span>only<span class="_ _a"> </span>the<span class="_ _a"> </span>built-in<span class="_ _a"> </span>microphones<span class="_ _a"> </span>in</div><div class="t m0 x1 h7 y23 ff4 fs4 fc0 sc0 ls0 ws0">virtually<span class="_ _e"> 
</span>all<span class="_ _a"> </span>smartphones,<span class="_ _a"> </span>which<span class="_ _a"> </span>can<span class="_ _e"> </span>be<span class="_ _a"> </span>readily<span class="_ _e"> </span>integrated<span class="_ _a"> </span>in</div><div class="t m0 x1 h7 y24 ff4 fs4 fc0 sc0 ls0 ws0">existing<span class="_ _d"> </span>voice<span class="_ _5"> </span>authentication<span class="_ _d"> </span>systems<span class="_ _5"> </span>for<span class="_ _d"> </span>mobile<span class="_ _d"> </span>devices.</div><div class="t m0 x1a h5 y25 ff1 fs3 fc0 sc0 ls0 ws0">I<span class="_ _12"></span>.<span class="_ _11"> </span>I<span class="_ _12"></span><span class="fs2">N<span class="_ _12"></span>T<span class="_ _12"></span>RO<span class="_ _12"></span>D<span class="_ _12"></span>U<span class="_ _12"></span>C<span class="_ _12"></span>T<span class="_ _12"></span>I<span class="_ _12"></span>O<span class="_ _12"></span>N</span></div><div class="t m0 x19 h5 y26 ff1 fs3 fc0 sc0 ls0 ws0">Compared<span class="_ _a"> </span>with<span class="_ _b"> </span>password-based<span class="_ _a"> </span>authentication,<span class="_ _b"> </span>voice<span class="_ _a"> </span>au-</div><div class="t m0 x1 h5 y27 ff1 fs3 fc0 sc0 ls0 ws0">thentication<span class="_"> </span>is<span class="_ _f"> </span>more<span class="_ _f"> </span>con<span class="_ _6"></span>venient<span class="_"> </span>since<span class="_ _f"> </span>it<span class="_ _f"> </span>is<span class="_"> </span>hands-free<span class="_ _f"> </span>and<span class="_ _f"> </span>users</div><div class="t m0 x1 h5 y28 ff1 fs3 fc0 sc0 ls0 ws0">do<span class="_ _f"> </span>not<span class="_ _d"> </span>need<span class="_ _f"> </span>to<span class="_ _d"> </span>memorize<span class="_ _f"> </span>passwords.<span class="_ _f"> </span>In<span class="_ _f"> </span>recent<span class="_ _d"> </span>years,<span class="_ _f"> </span>the<span class="_ _d"> </span>rapid</div><div class="t m0 x1 h5 y29 ff1 fs3 fc0 sc0 ls0 ws0">growth<span 
class="_ _13"> </span>of<span class="_"> </span>mobile<span class="_"> </span>communications<span class="_"> </span>has<span class="_ _13"> </span>boosted<span class="_"> </span>the<span class="_"> </span>use<span class="_"> </span>of<span class="_"> </span>v<span class="_ _6"></span>oice</div><div class="t m0 x1 h5 y2a ff1 fs3 fc0 sc0 ls0 ws0">authentication<span class="_ _d"> </span>in<span class="_ _5"> </span>mobile<span class="_ _d"> </span>devices,<span class="_ _d"> </span>including<span class="_ _d"> </span>smartphone<span class="_ _5"> </span>login,</div><div class="t m0 x1 h5 y2b ff1 fs3 fc0 sc0 ls0 ws0">mobile<span class="_ _f"> </span>banking<span class="_ _d"> </span>and<span class="_ _f"> </span>e-commerce.<span class="_ _f"> </span>For<span class="_ _f"> </span>example,<span class="_ _f"> </span>Google<span class="_ _d"> </span>allo<span class="_ _6"></span>ws</div><div class="t m0 x1 h5 y2c ff1 fs3 fc0 sc0 ls0 ws0">users<span class="_ _d"> </span>to<span class="_ _d"> </span>unlock<span class="_ _d"> </span>their<span class="_ _d"> </span>phones<span class="_ _d"> </span>of<span class="_ _d"> </span>Android<span class="_ _d"> </span>operating<span class="_ _d"> </span>systems<span class="_ _d"> </span>by</div><div class="t m0 x1 h5 y2d ff1 fs3 fc0 sc0 ls0 ws0">voice<span class="_ _f"> </span>biometrics<span class="_ _d"> </span>[1].<span class="_ _d"> </span>Say<span class="_ _d"> </span>T<span class="_ _4"></span>ec<span class="_ _d"> </span>uses<span class="_ _d"> </span>v<span class="_ _6"></span>oice<span class="_ _d"> </span>biometric<span class="_ _d"> </span>solution<span class="_ _d"> </span>to</div><div class="t m0 x1 h5 y2e ff1 fs3 fc0 sc0 ls0 ws0">support<span class="_ _d"> </span>mobile<span class="_ _5"> </span>financial<span class="_ _5"> </span>services<span class="_ _d"> </span>such<span class="_ _5"> </span>as<span class="_ _d"> </span>online<span class="_ _5"> </span>payment<span class="_ _5"> </span>and</div><div class="t m0 x1 h5 
y2f ff1 fs3 fc0 sc0 ls0 ws0">banking<span class="_ _5"> </span>[2].</div><div class="t m0 x19 h5 y30 ff1 fs3 fc0 sc0 ls0 ws0">Howe<span class="_ _6"></span>ver<span class="_ _6"></span>,<span class="_ _e"> </span>since<span class="_ _e"> </span>the<span class="_ _10"> </span>sound<span class="_ _e"> </span>transmits<span class="_ _e"> </span>through<span class="_ _e"> </span>an<span class="_ _e"> </span>open<span class="_ _e"> </span>and</div><div class="t m0 x1 h5 y31 ff1 fs3 fc0 sc0 ls0 ws0">public<span class="_ _d"> </span>channel,<span class="_ _5"> </span>the<span class="_ _d"> </span>voice<span class="_ _d"> </span>authentication<span class="_ _5"> </span>system<span class="_ _d"> </span>is<span class="_ _5"> </span>highly<span class="_ _5"> </span>vul-</div><div class="t m0 x1 h5 y32 ff1 fs3 fc0 sc0 ls0 ws0">nerable<span class="_ _f"> </span>to<span class="_ _d"> </span>spoofing<span class="_ _f"> </span>attacks<span class="_ _d"> </span>[3]–[5].<span class="_ _f"> </span>There<span class="_ _d"> </span>are<span class="_ _d"> </span>tw<span class="_ _6"></span>o<span class="_ _d"> </span>major<span class="_ _d"> </span>types</div><div class="t m0 x1 h5 y33 ff1 fs3 fc0 sc0 ls0 ws0">of<span class="_ _5"> </span>spoofing<span class="_ _5"> </span>attacks,<span class="_ _5"> </span>namely<span class="_ _3"> </span>replay<span class="_ _5"> </span>attacks<span class="_ _5"> </span>and<span class="_ _5"> </span>impersonation</div><div class="t m0 x1 h5 y34 ff1 fs3 fc0 sc0 ls0 ws0">attacks<span class="_ _e"> </span>[6].<span class="_ _10"> </span>In<span class="_ _e"> </span>replay<span class="_ _e"> </span>attacks,<span class="_ _10"> </span>the<span class="_ _e"> </span>adversary<span class="_ _10"> </span>pre-records<span class="_ _e"> </span>and</div><div class="t m0 x1 h5 y35 ff1 fs3 fc0 sc0 ls0 ws0">playbacks<span class="_ _5"> </span>the<span class="_ _3"> </span>voice<span class="_ _5"> </span>sample<span class="_ _3"> </span>of<span class="_ _3"> 
</span>the<span class="_ _5"> </span>passphrase<span class="_ _3"> </span>of<span class="_ _3"> </span>a<span class="_ _3"> </span>leg<span class="_ _6"></span>al<span class="_ _3"> </span>user</div><div class="t m0 x1 h5 y36 ff1 fs3 fc0 sc0 ls0 ws0">to<span class="_ _e"> </span>deceive<span class="_ _e"> </span>the<span class="_ _a"> </span>authentication<span class="_ _e"> </span>system<span class="_ _a"> </span>[7].<span class="_ _a"> </span>An<span class="_ _a"> </span>adversary<span class="_ _e"> </span>can</div><div class="t m0 x1 h5 y37 ff1 fs3 fc0 sc0 ls0 ws0">also<span class="_ _5"> </span>mimic<span class="_ _3"> </span>the<span class="_ _3"> </span>voice<span class="_ _5"> </span>characteristics<span class="_ _3"> </span>and<span class="_ _5"> </span>style<span class="_ _3"> </span>of<span class="_ _3"> </span>a<span class="_ _3"> </span>leg<span class="_ _6"></span>al<span class="_ _3"> </span>user</div><div class="t m0 x1 h5 y38 ff1 fs3 fc0 sc0 ls0 ws0">to<span class="_ _e"> </span>conduct<span class="_ _e"> </span>impersonation<span class="_ _e"> </span>attacks<span class="_ _e"> </span>[8].<span class="_ _a"> </span>Spoofing<span class="_ _e"> </span>attacks<span class="_ _e"> </span>may</div><div class="t m0 x1 h5 y39 ff1 fs3 fc0 sc0 ls0 ws0">greatly<span class="_ _f"> </span>harm<span class="_ _f"> </span>the<span class="_ _d"> </span>users<span class="_ _f"> </span>as<span class="_ _f"> </span>the<span class="_ _d"> </span>adv<span class="_ _6"></span>ersary<span class="_ _d"> </span>may<span class="_ _f"> </span>gain<span class="_ _f"> </span>access<span class="_ _f"> </span>to<span class="_ _d"> </span>the</div><div class="t m0 x1b h5 yf ff1 fs3 fc0 sc0 ls0 ws0">victim’<span class="_ _6"></span>s<span class="_ _5"> </span>smartphone<span class="_ _3"> </span>to<span class="_ _5"> </span>steal<span class="_ _3"> </span>priv<span class="_ _6"></span>ate<span class="_ _5"> </span>information<span class="_ _3"> </span>and<span class="_ _3"> 
</span>perform</div><div class="t m0 x1b h5 y3a ff1 fs3 fc0 sc0 ls0 ws0">malicious<span class="_ _5"> </span>operations.</div><div class="t m0 x1c h5 y3b ff1 fs3 fc0 sc0 ls0 ws0">T<span class="_ _6"></span>raditional<span class="_ _a"> </span>methods<span class="_ _a"> </span>to<span class="_ _e"> </span>defend<span class="_ _a"> </span>against<span class="_ _e"> </span>replay<span class="_ _a"> </span>attacks<span class="_ _a"> </span>and</div><div class="t m0 x1b h5 y3c ff1 fs3 fc0 sc0 ls0 ws0">impersonation<span class="_ _b"> </span>attacks<span class="_ _11"> </span>are<span class="_ _b"> </span>liveness<span class="_ _b"> </span>detection<span class="_ _b"> </span>and<span class="_ _11"> </span>automatic</div><div class="t m0 x1b h5 y3d ff1 fs3 fc0 sc0 ls0 ws0">speaker<span class="_ _5"> </span>verification<span class="_ _3"> </span>(ASV)<span class="_ _5"> </span>system.<span class="_ _3"> </span>Liv<span class="_ _6"></span>eness<span class="_ _3"> </span>detection<span class="_ _3"> </span>exam-</div><div class="t m0 x1b h5 y15 ff1 fs3 fc0 sc0 ls0 ws0">ines<span class="_ _f"> </span>whether<span class="_ _d"> </span>the<span class="_ _f"> </span>voice<span class="_ _f"> </span>is<span class="_ _f"> </span>produced<span class="_ _d"> </span>by<span class="_ _f"> </span>a<span class="_ _f"> </span>live<span class="_"> </span>user<span class="_ _d"> </span>or<span class="_ _f"> </span>a<span class="_ _d"> </span>speaker<span class="_ _6"></span>,</div><div class="t m0 x1b h5 y3e ff1 fs3 fc0 sc0 ls0 ws0">and<span class="_"> </span>ASV<span class="_ _13"> </span>le<span class="_ _6"></span>verages<span class="_"> </span>unique<span class="_ _13"> </span>spectral<span class="_"> </span>and<span class="_ _13"> </span>prosodic<span class="_ _13"> </span>features<span class="_"> </span>of<span class="_ _13"> </span>the</div><div class="t m0 x1b h5 y3f ff1 fs3 fc0 sc0 ls0 ws0">user’<span class="_ _6"></span>s<span class="_ _f"> </span>voice<span class="_ _c"> 
</span>for<span class="_ _d"> </span>identity<span class="_ _f"> </span>authentication.<span class="_ _f"> </span>For<span class="_ _f"> </span>example,<span class="_ _f"> </span>Zhang<span class="_ _f"> </span><span class="ff5">et</span></div><div class="t m0 x1b h5 y40 ff5 fs3 fc0 sc0 ls0 ws0">al.<span class="_ _5"> </span><span class="ff1">[9]<span class="_ _3"> </span>proposed<span class="_ _5"> </span>to<span class="_ _3"> </span>capture<span class="_ _5"> </span>time-difference-of-arri<span class="_ _6"></span>val<span class="_ _5"> </span>(TDoA)</span></div><div class="t m0 x1b h5 y41 ff1 fs3 fc0 sc0 ls0 ws0">changes<span class="_ _3"> </span>to<span class="_ _3"> </span>the<span class="_ _10"> </span>two<span class="_ _3"> </span>microphones<span class="_ _3"> </span>of<span class="_ _3"> </span>the<span class="_ _3"> </span>phone<span class="_ _10"> </span>in<span class="_ _3"> </span>a<span class="_ _3"> </span>sequence</div><div class="t m0 x1b h5 y1b ff1 fs3 fc0 sc0 ls0 ws0">of<span class="_ _d"> </span>phoneme<span class="_ _5"> </span>sounds<span class="_ _5"> </span>to<span class="_ _d"> </span>differentiate<span class="_ _d"> </span>the<span class="_ _5"> </span>voice<span class="_ _d"> </span>from<span class="_ _5"> </span>a<span class="_ _d"> </span>live<span class="_ _d"> </span>user</div><div class="t m0 x1b h5 y42 ff1 fs3 fc0 sc0 ls0 ws0">and<span class="_ _3"> </span>a<span class="_ _3"> </span>replay<span class="_ _3"> </span>device,<span class="_ _3"> </span>but<span class="_ _3"> </span>the<span class="_ _3"> </span>user<span class="_ _3"> </span>has<span class="_ _3"> </span>to<span class="_ _10"> </span>hold<span class="_ _3"> </span>the<span class="_ _3"> </span>phone<span class="_ _3"> </span>at<span class="_ _10"> </span>a</div><div class="t m0 x1b h5 y43 ff1 fs3 fc0 sc0 ls0 ws0">specific<span class="_ _d"> </span>position.<span class="_ _d"> </span>In<span class="_ _d"> </span>[10],<span class="_ _d"> </span>the<span class="_ _d"> 
</span>smartphone<span class="_ _d"> </span>served<span class="_ _d"> </span>as<span class="_ _d"> </span>a<span class="_ _d"> </span>Doppler</div><div class="t m0 x1b h5 y44 ff1 fs3 fc0 sc0 ls0 ws0">radar<span class="_ _e"> </span>to<span class="_ _e"> </span>transmit<span class="_ _a"> </span>a<span class="_ _e"> </span>high-frequency<span class="_ _e"> </span>acoustic<span class="_ _e"> </span>sound<span class="_ _e"> </span>from<span class="_ _a"> </span>the</div><div class="t m0 x1b h5 y45 ff1 fs3 fc0 sc0 ls0 ws0">built-in<span class="_ _b"> </span>speaker<span class="_ _11"> </span>and<span class="_ _11"> </span>monitor<span class="_ _b"> </span>the<span class="_ _11"> </span>reflections<span class="_ _11"> </span>of<span class="_ _11"> </span>articulators</div><div class="t m0 x1b h5 y46 ff1 fs3 fc0 sc0 ls0 ws0">at<span class="_ _e"> </span>the<span class="_ _e"> </span>microphone<span class="_ _e"> </span>for<span class="_ _e"> </span>liv<span class="_ _6"></span>eness<span class="_ _e"> </span>detection.<span class="_ _e"> </span>Unfortunately<span class="_ _6"></span>,<span class="_ _e"> </span>the</div><div class="t m0 x1b h5 y47 ff1 fs3 fc0 sc0 ls0 ws0">extent<span class="_ _3"> </span>of<span class="_ _10"> </span>articulatory<span class="_ _10"> </span>movements<span class="_ _3"> </span>affects<span class="_ _3"> </span>the<span class="_ _10"> </span>effecti<span class="_ _6"></span>veness<span class="_ _3"> </span>of</div><div class="t m0 x1b h5 y48 ff1 fs3 fc0 sc0 ls0 ws0">this<span class="_ _3"> </span>countermeasure.<span class="_ _10"> </span>Chen<span class="_ _10"> </span><span class="ff5">et<span class="_ _10"> </span>al.<span class="_ _3"> </span></span>[11]<span class="_ _10"> </span>explored<span class="_ _3"> </span>the<span class="_ _10"> </span>magnetic</div><div class="t m0 x1b h5 y49 ff1 fs3 fc0 sc0 ls0 ws0">field<span class="_ _d"> </span>emitted<span class="_ _f"> </span>from<span class="_ _d"> </span>loudspeakers<span class="_ _f"> 
</span>to<span class="_ _d"> </span>detect<span class="_ _f"> </span>voice<span class="_ _d"> </span>replay<span class="_ _f"> </span>attacks.</div><div class="t m0 x1b h5 y4a ff1 fs3 fc0 sc0 ls0 ws0">Howe<span class="_ _6"></span>ver<span class="_ _6"></span>,<span class="_"> </span>users<span class="_ _13"> </span>need<span class="_"> </span>to<span class="_"> </span>mo<span class="_ _6"></span>ve<span class="_"> </span>the<span class="_"> </span>smartphone<span class="_ _13"> </span>with<span class="_"> </span>a<span class="_ _13"> </span>predefined</div><div class="t m0 x1b h5 y4b ff1 fs3 fc0 sc0 ls0 ws0">trajectory<span class="_ _e"> </span>around<span class="_ _e"> </span>the<span class="_ _e"> </span>mouth<span class="_ _a"> </span>while<span class="_ _e"> </span>speaking<span class="_ _e"> </span>the<span class="_ _e"> </span>passphrase.</div><div class="t m0 x1b h5 y4c ff1 fs3 fc0 sc0 ls0 ws0">M<span class="_ _e"> </span>Sahidullah<span class="_ _10"> </span><span class="ff5">et<span class="_ _e"> </span>al.<span class="_ _e"> </span></span>[12]<span class="_ _10"> </span>developed<span class="_ _10"> </span>an<span class="_ _e"> </span>ASV<span class="_ _10"> </span>system<span class="_ _e"> </span>against</div><div class="t m0 x1b h5 y4d ff1 fs3 fc0 sc0 ls0 ws0">impersonation<span class="_ _10"> </span>attacks<span class="_ _e"> </span>using<span class="_ _e"> </span>the<span class="_ _10"> </span>throat<span class="_ _e"> </span>microphone<span class="_ _e"> </span>which<span class="_ _10"> </span>is</div><div class="t m0 x1b h5 y4e ff1 fs3 fc0 sc0 ls0 ws0">not<span class="_ _5"> </span>av<span class="_ _6"></span>ailable<span class="_ _5"> </span>in<span class="_ _5"> </span>most<span class="_ _5"> </span>smartphones.</div><div class="t m0 x1c h5 y4f ff1 fs3 fc0 sc0 ls0 ws0">In<span class="_ _d"> </span>this<span class="_ _f"> </span>paper<span class="_ _6"></span>,<span class="_ _d"> </span>we<span class="_ _d"> </span>propose<span class="_ _f"> 
</span>and<span class="_ _d"> </span>implement<span class="_ _d"> </span>V<span class="_ _7"></span>oicePop,<span class="_ _f"> </span>a<span class="_ _d"> </span>no<span class="_ _6"></span>vel</div><div class="t m0 x1b h5 y50 ff1 fs3 fc0 sc0 ls0 ws0">and<span class="_ _5"> </span>practical<span class="_ _5"> </span>anti-spoofing<span class="_ _5"> </span>system<span class="_ _5"> </span>based<span class="_ _3"> </span>on<span class="_ _5"> </span><span class="ff5">pop<span class="_ _5"> </span>noise<span class="_ _5"> </span></span>that<span class="_ _5"> </span>is</div><div class="t m0 x1b h5 y51 ff1 fs3 fc0 sc0 ls0 ws0">induced<span class="_ _3"> </span>by<span class="_ _5"> </span>the<span class="_ _3"> </span>user<span class="_ _3"> </span>breathing<span class="_ _3"> </span>while<span class="_ _3"> </span>speaking<span class="_ _3"> </span>the<span class="_ _3"> </span>passphrase</div><div class="t m0 x1b h5 y52 ff1 fs3 fc0 sc0 ls0 ws0">close<span class="_ _3"> </span>to<span class="_ _3"> </span>the<span class="_ _3"> </span>microphone.<span class="_ _10"> </span>The<span class="_ _3"> </span>recorded<span class="_ _3"> </span>voice<span class="_ _3"> </span>samples<span class="_ _3"> </span>hardly</div><div class="t m0 x1b h5 y53 ff1 fs3 fc0 sc0 ls0 ws0">contain<span class="_ _b"> </span>the<span class="_ _14"> </span>pop<span class="_ _b"> </span>noise<span class="_ _14"> </span>since<span class="_ _b"> </span>the<span class="_ _14"> </span>sound<span class="_ _b"> </span>of<span class="_ _14"> </span>breath<span class="_ _b"> </span>is<span class="_ _14"> </span>gentle</div><div class="t m0 x1b h5 y54 ff1 fs3 fc0 sc0 ls0 ws0">compared<span class="_ _a"> </span>to<span class="_ _a"> </span>the<span class="_ _a"> </span>speech<span class="_ _a"> </span>and<span class="_ _a"> </span>will<span class="_ _a"> </span>die<span class="_ _a"> </span>out<span class="_ _a"> </span>beyond<span class="_ _a"> </span>a<span class="_ _a"> </span>certain</div><div 
class="t m0 x1b h5 y55 ff1 fs3 fc0 sc0 ls0 ws0">distance.<span class="_ _d"> </span>The<span class="_ _d"> </span>pop<span class="_ _d"> </span>noise<span class="_ _d"> </span>is<span class="_ _d"> </span>also<span class="_ _d"> </span>subject<span class="_ _d"> </span>to<span class="_ _d"> </span>user<span class="_ _d"> </span>di<span class="_ _6"></span>versity<span class="_ _d"> </span>and<span class="_ _d"> </span>it</div><div class="t m0 x1b h5 y30 ff1 fs3 fc0 sc0 ls0 ws0">is<span class="_ _3"> </span>very<span class="_ _5"> </span>difficult<span class="_ _3"> </span>for<span class="_ _3"> </span>attackers<span class="_ _3"> </span>to<span class="_ _3"> </span>imitate<span class="_ _3"> </span>the<span class="_ _3"> </span>way<span class="_ _3"> </span>of<span class="_ _3"> </span>breathing</div><div class="t m0 x1b h5 y56 ff1 fs3 fc0 sc0 ls0 ws0">of<span class="_ _b"> </span>the<span class="_ _14"> </span>legal<span class="_ _b"> </span>user<span class="_ _6"></span>.<span class="_ _b"> </span>These<span class="_ _14"> </span>ideal<span class="_ _14"> </span>properties<span class="_ _b"> </span>of<span class="_ _14"> </span>the<span class="_ _14"> </span>pop<span class="_ _b"> </span>noise</div><div class="t m0 x1b h5 y32 ff1 fs3 fc0 sc0 ls0 ws0">enable<span class="_"> </span>our<span class="_ _13"> </span>proposed<span class="_"> </span>V<span class="_ _7"></span>oicePop<span class="_ _13"> </span>system<span class="_"> </span>to<span class="_ _13"> </span>resist<span class="_"> </span>spoofing<span class="_ _13"> </span>attacks</div><div class="t m0 x1b h5 y33 ff1 fs3 fc0 sc0 ls0 ws0">in<span class="_ _3"> </span>voice<span class="_ _5"> </span>authentication.<span class="_ _3"> </span>T<span class="_ _6"></span>o<span class="_ _3"> </span>begin<span class="_ _5"> </span>with,<span class="_ _3"> </span>we<span class="_ _3"> </span>conduct<span class="_ _3"> </span>phoneme</div><div class="t m0 x1b h5 y34 ff1 fs3 fc0 sc0 ls0 ws0">segmentation<span class="_"> 
</span>on<span class="_ _f"> </span>the<span class="_ _f"> </span>collected<span class="_ _f"> </span>voice<span class="_ _f"> </span>sample<span class="_ _f"> </span>according<span class="_ _f"> </span>to<span class="_ _f"> </span>spec-</div><div class="t m0 x1b h5 y35 ff1 fs3 fc0 sc0 ls0 ws0">trogram<span class="_ _f"> </span>characteristics.<span class="_ _f"> </span>W<span class="_ _4"></span>e<span class="_ _f"> </span>design<span class="_ _f"> </span>a<span class="_ _f"> </span>novel<span class="_"> </span>pop<span class="_ _f"> </span>noise<span class="_ _f"> </span>detection</div><div class="t m0 x1b h5 y36 ff1 fs3 fc0 sc0 ls0 ws0">algorithm<span class="_"> </span>to<span class="_"> </span>locate<span class="_ _f"> </span>pop<span class="_"> </span>noises<span class="_ _f"> </span>at<span class="_"> </span>the<span class="_ _f"> </span>phonemic<span class="_"> </span>level.<span class="_"> </span>A<span class="_"> </span>lack<span class="_"> </span>of</div><div class="t m0 x1b h5 y57 ff1 fs3 fc0 sc0 ls0 ws0">pop<span class="_"> </span>noise<span class="_ _13"> </span>is<span class="_ _13"> </span>deemed<span class="_"> </span>as<span class="_ _13"> </span>a<span class="_"> </span>replay<span class="_ _13"> </span>attack,<span class="_ _13"> </span>ho<span class="_ _6"></span>wever<span class="_ _6"></span>,<span class="_ _13"> </span>en<span class="_ _6"></span>vironmental</div><div class="t m0 x1b h5 y38 ff1 fs3 fc0 sc0 ls0 ws0">noise<span class="_"> </span>and<span class="_"> </span>hardware<span class="_"> </span>noise<span class="_"> </span>may<span class="_"> </span>also<span class="_"> </span>be<span class="_"> </span>wrongly<span class="_ _f"> </span>detected<span class="_"> </span>as<span class="_"> </span>pop</div><div class="t m0 x1b h5 y39 ff1 fs3 fc0 sc0 ls0 ws0">noises<span class="_ _f"> </span>in<span class="_ _f"> </span>the<span class="_ _d"> </span>replayed<span class="_"> </span>voice<span class="_ _f"> </span>samples.<span class="_ _f"> 
</span>T<span class="_ _6"></span>o<span class="_ _f"> </span>address<span class="_ _f"> </span>this<span class="_ _f"> </span>problem,</div></div><div class="pi" data-data='{"ctm":[1.568627,0.000000,0.000000,1.568627,0.000000,0.000000]}'></div></div>
<!-- Page 1 ends above; page 2 follows within the same document body. -->
<div id="pf2" class="pf w0 h0" data-page-no="2"><div class="pc pc2 w0 h0"><img class="bi x0 y0 w1 h1" alt="" src="https://static.pudn.com/prod/directory_preview_static/625b94e3be9ad24cfa1d841d/bg2.jpg"><div class="t m0 x1 h5 y58 ff1 fs3 fc0 sc0 ls0 ws0">we<span class="_ _e"> </span>extract<span class="_ _a"> </span>the<span class="_ _e"> </span>Gammatone<span class="_ _a"> </span>Frequency<span class="_ _e"> </span>Cepstral<span class="_ _a"> </span>Coefficients</div><div class="t m0 x1 h5 y59 ff1 fs3 fc0 sc0 ls0 ws0">(GFCC)<span class="_ _5"> </span>features<span class="_ _3"> </span>of<span class="_ _3"> </span>the<span class="_ _5"> </span>detected<span class="_ _3"> </span>pop<span class="_ _3"> </span>noises<span class="_ _5"> </span>for<span class="_ _3"> </span>classification</div><div class="t m0 x1 h5 y5a ff1 fs3 fc0 sc0 ls0 ws0">to<span class="_ _a"> </span>distinguish<span class="_ _a"> </span>a<span class="_ _a"> </span>genuine<span class="_ _a"> </span>voice<span class="_ _e"> </span>sample<span class="_ _a"> </span>and<span class="_ _a"> </span>a<span class="_ _a"> </span>replayed<span class="_ _a"> </span>one.</div><div class="t m0 x1 h5 y5b ff1 fs3 fc0 sc0 ls0 ws0">T<span class="_ _4"></span>o<span class="_ _9"> </span>defend<span class="_ _11"> </span>against<span class="_ _9"> </span>impersonation<span class="_ _11"> </span>attacks,<span class="_ _9"> </span>we<span class="_ _9"> </span>lev<span class="_ _6"></span>erage<span class="_ _9"> </span>the</div><div class="t m0 x1 h5 y5c ff1 fs3 fc0 sc0 ls0 ws0">individually<span class="_ _10"> </span>unique<span class="_ _10"> </span>relationship<span class="_ _e"> </span>between<span class="_ _10"> </span>phonemes<span class="_ _e"> </span>and<span class="_ _10"> </span>pop</div><div class="t m0 x1 h5 y5d ff1 fs3 fc0 sc0 ls0 ws0">noises<span class="_ _3"> </span>to<span class="_ _5"> </span>construct<span class="_ _3"> </span>a<span class="_ _3"> </span>phoneme-pop<span class="_ _3"> 
</span>sequence.<span class="_ _3"> </span>A<span class="_ _3"> </span>legal<span class="_ _5"> </span>user<span class="_ _3"> </span>is</div><div class="t m0 x1 h5 y5e ff1 fs3 fc0 sc0 ls0 ws0">accepted<span class="_ _d"> </span>if<span class="_ _5"> </span>the<span class="_ _d"> </span>phoneme-pop<span class="_ _5"> </span>sequence<span class="_ _d"> </span>of<span class="_ _5"> </span>the<span class="_ _d"> </span>voice<span class="_ _d"> </span>sample<span class="_ _d"> </span>is</div><div class="t m0 x1 h5 y5f ff1 fs3 fc0 sc0 ls0 ws0">similar<span class="_ _d"> </span>to<span class="_ _d"> </span>that<span class="_ _d"> </span>stored<span class="_ _d"> </span>in<span class="_ _d"> </span>the<span class="_ _d"> </span>user<span class="_ _d"> </span>profile<span class="_ _d"> </span>upon<span class="_ _d"> </span>registration,<span class="_ _d"> </span>and</div><div class="t m0 x1 h5 y60 ff1 fs3 fc0 sc0 ls0 ws0">an<span class="_ _5"> </span>impersonation<span class="_ _5"> </span>attack<span class="_ _5"> </span>is<span class="_ _5"> </span>declared<span class="_ _5"> </span>otherwise.</div><div class="t m0 x19 h5 y61 ff1 fs3 fc0 sc0 ls0 ws0">V<span class="_ _7"></span>oicePop<span class="_ _11"> </span>requires<span class="_ _9"> </span>no<span class="_ _9"> </span>additional<span class="_ _9"> </span>hardware<span class="_ _9"> </span>but<span class="_ _11"> </span>only<span class="_ _9"> </span>the</div><div class="t m0 x1 h5 y62 ff1 fs3 fc0 sc0 ls0 ws0">built-in<span class="_ _3"> </span>microphones<span class="_ _10"> </span>that<span class="_ _10"> </span>are<span class="_ _e"> </span>a<span class="_ _6"></span>vailable<span class="_ _10"> </span>on<span class="_ _10"> </span>almost<span class="_ _10"> </span>all<span class="_ _10"> </span>mobile</div><div class="t m0 x1 h5 y63 ff1 fs3 fc0 sc0 ls0 ws0">devices.<span class="_ _3"> </span>V<span class="_ _7"></span>oicePop<span class="_ _10"> </span>also<span class="_ _10"> </span>demands<span class="_ _e"> 
</span>no<span class="_ _10"> </span>extra<span class="_ _10"> </span>efforts<span class="_ _3"> </span>from<span class="_ _e"> </span>users</div><div class="t m0 x1 h5 y64 ff1 fs3 fc0 sc0 ls0 ws0">except<span class="_ _10"> </span>to<span class="_ _10"> </span>speak<span class="_ _10"> </span>the<span class="_ _10"> </span>passphrase<span class="_ _e"> </span>as<span class="_ _10"> </span>required<span class="_ _10"> </span>by<span class="_ _10"> </span>current<span class="_ _e"> </span>voice</div><div class="t m0 x1 h5 y65 ff1 fs3 fc0 sc0 ls0 ws0">authentication<span class="_ _10"> </span>systems.<span class="_ _e"> </span>As<span class="_ _10"> </span>far<span class="_ _e"> </span>as<span class="_ _10"> </span>we<span class="_ _e"> </span>are<span class="_ _10"> </span>concerned,<span class="_ _e"> </span>we<span class="_ _10"> </span>are</div><div class="t m0 x1 h5 y66 ff1 fs3 fc0 sc0 ls0 ws0">the<span class="_ _d"> </span>first<span class="_ _f"> </span>to<span class="_ _d"> </span>use<span class="_ _d"> </span>the<span class="_ _d"> </span>features<span class="_ _f"> </span>of<span class="_ _d"> </span>pop<span class="_ _d"> </span>noise<span class="_ _d"> </span>to<span class="_ _f"> </span>defend<span class="_ _d"> </span>both<span class="_ _d"> </span>replay</div><div class="t m0 x1 h5 y67 ff1 fs3 fc0 sc0 ls0 ws0">attacks<span class="_"> </span>and<span class="_"> </span>impersonation<span class="_ _f"> </span>attacks.<span class="_"> </span>W<span class="_ _6"></span>e<span class="_"> </span>implement<span class="_"> </span>V<span class="_ _7"></span>oicePop<span class="_"> </span>on</div><div class="t m0 x1 h5 y68 ff1 fs3 fc0 sc0 ls0 ws0">3<span class="_ _5"> </span>types<span class="_ _5"> </span>of<span class="_ _5"> </span>smartphones<span class="_ _5"> </span>and<span class="_ _3"> </span>e<span class="_ _6"></span>valuate<span class="_ _5"> </span>its<span class="_ _5"> </span>performance<span class="_ _5"> </span>with<span class="_ _5"> 
</span>18</div><div class="t m0 x1 h5 y69 ff1 fs3 fc0 sc0 ls0 ws0">volunteers<span class="_ _10"> </span>under<span class="_ _e"> </span>different<span class="_ _10"> </span>experimental<span class="_ _10"> </span>settings.<span class="_ _e"> </span>The<span class="_ _e"> </span>results</div><div class="t m0 x1 h5 y6a ff1 fs3 fc0 sc0 ls0 ws0">verify<span class="_ _d"> </span>the<span class="_ _d"> </span>ef<span class="_ _6"></span>fectiv<span class="_ _6"></span>eness<span class="_ _5"> </span>of<span class="_ _d"> </span>V<span class="_ _7"></span>oicePop<span class="_ _d"> </span>that<span class="_ _d"> </span>achie<span class="_ _6"></span>ves<span class="_ _d"> </span>ov<span class="_ _6"></span>er<span class="_ _d"> </span>93.5%</div><div class="t m0 x1 h5 y6b ff1 fs3 fc0 sc0 ls0 ws0">detection<span class="_ _d"> </span>accuracy<span class="_ _d"> </span>at<span class="_ _5"> </span>around<span class="_ _d"> </span>5.4%<span class="_ _5"> </span>equal<span class="_ _d"> </span>error<span class="_ _d"> </span>rate.<span class="_ _5"> </span>The<span class="_ _d"> </span>main</div><div class="t m0 x1 h5 y6c ff1 fs3 fc0 sc0 ls0 ws0">contributions<span class="_ _d"> </span>of<span class="_ _5"> </span>this<span class="_ _5"> </span>work<span class="_ _5"> </span>are<span class="_ _5"> </span>summarized<span class="_ _5"> </span>as<span class="_ _5"> </span>follows:</div><div class="t m0 x19 h5 y6d ff6 fs5 fc0 sc0 ls0 ws0">•<span class="_ _9"> </span><span class="ff1 fs3">W<span class="_ _4"></span>e<span class="_ _d"> </span>propose<span class="_ _f"> </span>V<span class="_ _7"></span>oicePop,<span class="_ _d"> </span>a<span class="_ _f"> </span>practical<span class="_ _d"> </span>and<span class="_ _d"> </span>ef<span class="_ _6"></span>fectiv<span class="_ _6"></span>e<span class="_ _d"> </span>software-</span></div><div class="t m0 x1d h5 y6e ff1 fs3 fc0 sc0 ls0 ws0">only<span class="_ _5"> </span>anti-spoofing<span class="_ _5"> </span>system<span class="_ 
_5"> </span>for<span class="_ _5"> </span>voice<span class="_ _d"> </span>authentication<span class="_ _5"> </span>based</div><div class="t m0 x1d h5 y6f ff1 fs3 fc0 sc0 ls0 ws0">on<span class="_ _d"> </span>pop<span class="_ _d"> </span>noise,<span class="_ _d"> </span>which<span class="_ _d"> </span>can<span class="_ _d"> </span>be<span class="_ _d"> </span>easily<span class="_ _d"> </span>integrated<span class="_ _d"> </span>in<span class="_ _f"> </span>commer-</div><div class="t m0 x1d h5 y70 ff1 fs3 fc0 sc0 ls0 ws0">cial<span class="_ _5"> </span>off-the-shelf<span class="_ _d"> </span>smartphones.</div><div class="t m0 x19 h5 y71 ff6 fs5 fc0 sc0 ls0 ws0">•<span class="_ _9"> </span><span class="ff1 fs3">W<span class="_ _4"></span>e<span class="_ _a"> </span>design<span class="_ _e"> </span>a<span class="_ _a"> </span>nov<span class="_ _6"></span>el<span class="_ _a"> </span>pop<span class="_ _a"> </span>noise<span class="_ _e"> </span>detection<span class="_ _a"> </span>scheme<span class="_ _a"> </span>to<span class="_ _e"> </span>de-</span></div><div class="t m0 x1d h5 y72 ff1 fs3 fc0 sc0 ls0 ws0">fend<span class="_ _a"> </span>against<span class="_ _b"> </span>replay<span class="_ _b"> </span>spoofing<span class="_ _b"> </span>attacks,<span class="_ _b"> </span>and<span class="_ _b"> </span>lev<span class="_ _6"></span>erage<span class="_ _b"> </span>the</div><div class="t m0 x1d h5 y73 ff1 fs3 fc0 sc0 ls0 ws0">individually<span class="_ _3"> </span>unique<span class="_ _10"> </span>relationship<span class="_ _10"> </span>between<span class="_ _10"> </span>phonemes<span class="_ _10"> </span>and</div><div class="t m0 x1d h5 y74 ff1 fs3 fc0 sc0 ls0 ws0">pop<span class="_ _d"> </span>noises<span class="_ _d"> </span>to<span class="_ _d"> </span>generate<span class="_ _d"> </span>a<span class="_ _d"> </span>phoneme-pop<span class="_ _f"> </span>sequence<span class="_ _d"> </span>to<span class="_ _d"> </span>resist</div><div class="t m0 x1d h5 y75 ff1 fs3 fc0 
sc0 ls0 ws0">impersonation<span class="_ _5"> </span>spoofing<span class="_ _5"> </span>attacks.</div><div class="t m0 x19 h5 y76 ff6 fs5 fc0 sc0 ls0 ws0">•<span class="_ _9"> </span><span class="ff1 fs3">W<span class="_ _4"></span>e<span class="_ _14"> </span>build<span class="_ _14"> </span>a<span class="_ _14"> </span>fully-functional<span class="_ _14"> </span>V<span class="_ _7"></span>oicePop<span class="_ _14"> </span>prototype<span class="_ _14"> </span>using</span></div><div class="t m0 x1d h5 y77 ff1 fs3 fc0 sc0 ls0 ws0">off-the-shelf<span class="_ _14"> </span>smartphones.<span class="_ _14"> </span>Extensive<span class="_ _14"> </span>ev<span class="_ _6"></span>aluation<span class="_ _11"> </span>results</div><div class="t m0 x1d h5 y78 ff1 fs3 fc0 sc0 ls0 ws0">demonstrate<span class="_ _a"> </span>that<span class="_ _a"> </span>V<span class="_ _7"></span>oicePop<span class="_ _a"> </span>can<span class="_ _a"> </span>detect<span class="_ _a"> </span>both<span class="_ _b"> </span>replay<span class="_ _a"> </span>and</div><div class="t m0 x1d h5 y79 ff1 fs3 fc0 sc0 ls0 ws0">impersonation<span class="_ _d"> </span>spoofing<span class="_ _d"> </span>attacks<span class="_ _d"> </span>with<span class="_ _d"> </span>a<span class="_ _d"> </span>high<span class="_ _5"> </span>accuracy<span class="_ _d"> </span>and</div><div class="t m0 x1d h5 y7a ff1 fs3 fc0 sc0 ls0 ws0">a<span class="_ _5"> </span>low<span class="_ _d"> </span>equal<span class="_ _5"> </span>error<span class="_ _5"> </span>rate.</div><div class="t m0 x1e h5 y7b ff1 fs3 fc0 sc0 ls0 ws0">I<span class="_ _12"></span>I<span class="_ _12"></span>.<span class="_ _14"> </span>P<span class="_ _12"></span><span class="fs2">R<span class="_ _12"></span>E<span class="_ _12"></span>L<span class="_ _12"></span>I<span class="_ _12"></span>M<span class="_ _12"></span>I<span class="_ _12"></span>NA<span class="_ _12"></span>R<span class="_ _12"></span>I<span class="_ _12"></span>E<span class="_ 
_12"></span>S</span></div><div class="t m0 x1 h8 y7c ff5 fs3 fc0 sc0 ls0 ws0">A.<span class="_ _a"> </span>Attack<span class="_ _d"> </span>Model</div><div class="t m0 x19 h5 y7d ff1 fs3 fc0 sc0 ls0 ws0">V<span class="_ _7"></span>oice<span class="_"> </span>authentication<span class="_ _f"> </span>system<span class="_ _f"> </span>can<span class="_ _d"> </span>be<span class="_"> </span>text-dependent<span class="_ _f"> </span>(requires</div><div class="t m0 x1 h5 y7e ff1 fs3 fc0 sc0 ls0 ws0">the<span class="_ _a"> </span>same<span class="_ _a"> </span>password<span class="_ _a"> </span>for<span class="_ _a"> </span>enrolment<span class="_ _a"> </span>and<span class="_ _a"> </span>verification)<span class="_ _a"> </span>or<span class="_ _a"> </span>text-</div><div class="t m0 x1 h5 y7f ff1 fs3 fc0 sc0 ls0 ws0">independent<span class="_ _10"> </span>(accept<span class="_ _e"> </span>arbitrary<span class="_ _e"> </span>utterances<span class="_ _e"> </span>from<span class="_ _10"> </span>speakers).<span class="_ _e"> </span>W<span class="_ _4"></span>e</div><div class="t m0 x1 h5 y80 ff1 fs3 fc0 sc0 ls0 ws0">primarily<span class="_ _10"> </span>focus<span class="_ _3"> </span>on<span class="_ _10"> </span>the<span class="_ _10"> </span>text-dependent<span class="_ _3"> </span>authentication<span class="_ _10"> </span>system,</div><div class="t m0 x1 h5 y81 ff1 fs3 fc0 sc0 ls0 ws0">which<span class="_ _d"> </span>is<span class="_ _f"> </span>currently<span class="_ _d"> </span>the<span class="_ _d"> </span>most<span class="_ _d"> </span>widely<span class="_ _d"> </span>adopted<span class="_ _f"> </span>and<span class="_ _d"> </span>commercially</div><div class="t m0 x1 h5 y82 ff1 fs3 fc0 sc0 ls0 ws0">viable<span class="_ _f"> </span>method<span class="_ _f"> </span>with<span class="_ _f"> </span>a<span class="_ _d"> </span>high<span class="_"> </span>authentication<span class="_ _f"> </span>accuracy<span class="_ _f"> </span>[13].<span class="_ _f"> </span>Fig.<span 
class="_ _d"> </span>1</div><div class="t m0 x1 h5 y83 ff1 fs3 fc0 sc0 ls0 ws0">displays<span class="_ _5"> </span>a<span class="_ _3"> </span>typical<span class="_ _5"> </span>voice<span class="_ _5"> </span>authentication<span class="_ _3"> </span>system.<span class="_ _5"> </span>For<span class="_ _5"> </span>the<span class="_ _3"> </span>attack</div><div class="t m0 x1 h5 y84 ff1 fs3 fc0 sc0 ls0 ws0">model,<span class="_ _f"> </span>we<span class="_ _f"> </span>consider<span class="_ _d"> </span>replay<span class="_ _c"> </span>spoofing<span class="_ _d"> </span>attacks<span class="_ _f"> </span>and<span class="_ _f"> </span>impersonation</div><div class="t m0 x1 h5 y85 ff1 fs3 fc0 sc0 ls0 ws0">spoofing<span class="_ _5"> </span>attacks.</div><div class="t m0 x19 h9 y34 ff4 fs3 fc0 sc0 ls0 ws0">Replay<span class="_ _9"> </span>attacks<span class="ff1">.<span class="_ _14"> </span>Replay<span class="_ _14"> </span>attacks<span class="_ _14"> </span>lev<span class="_ _6"></span>erage<span class="_ _14"> </span>computers<span class="_ _14"> </span>and</span></div><div class="t m0 x1 h5 y35 ff1 fs3 fc0 sc0 ls0 ws0">other<span class="_ _3"> </span>peripheral<span class="_ _10"> </span>de<span class="_ _6"></span>vices<span class="_ _10"> </span>(e.g.,<span class="_ _3"> </span>loudspeaker)<span class="_ _3"> </span>to<span class="_ _10"> </span>perform<span class="_ _3"> </span>voice</div><div class="t m0 x1 h5 y36 ff1 fs3 fc0 sc0 ls0 ws0">playback<span class="_ _e"> </span>to<span class="_ _a"> </span>the<span class="_ _e"> </span>microphone<span class="_ _a"> </span>of<span class="_ _e"> </span>the<span class="_ _a"> </span>smartphone.<span class="_ _e"> </span>The<span class="_ _a"> </span>replay</div><div class="t m0 x1 h5 y37 ff1 fs3 fc0 sc0 ls0 ws0">samples<span class="_"> </span>that<span class="_"> </span>in<span class="_ _6"></span>volve<span class="_"> </span>the<span class="_"> </span>information<span class="_ _13"> </span>of<span class="_"> 
</span>the<span class="_"> </span>victim’<span class="_ _6"></span>s<span class="_"> </span>passphrase</div><div class="t m0 x1 h5 y38 ff1 fs3 fc0 sc0 ls0 ws0">can<span class="_ _9"> </span>be<span class="_ _8"> </span>produced<span class="_ _9"> </span>by<span class="_ _8"> </span>stealthily<span class="_ _9"> </span>recording,<span class="_ _8"> </span>voice<span class="_ _9"> </span>synthesis,</div><div class="t m0 x1 h5 y39 ff1 fs3 fc0 sc0 ls0 ws0">and<span class="_ _11"> </span>voice<span class="_ _14"> </span>conv<span class="_ _6"></span>ersion.<span class="_ _11"> </span>In<span class="_ _9"> </span>this<span class="_ _11"> </span>paper<span class="_ _6"></span>,<span class="_ _9"> </span>we<span class="_ _11"> </span>mainly<span class="_ _11"> </span>focus<span class="_ _11"> </span>on</div><div class="c x1f y86 w2 ha"><div class="t m0 x20 hb y87 ff7 fs6 fc0 sc0 ls0 ws0"><span class="fc1 sc0">M</span><span class="fc1 sc0">i</span><span class="fc1 sc0">c</span><span class="fc1 sc0">ro</span><span class="fc1 sc0">p</span><span class="fc1 sc0">h</span><span class="fc1 sc0">o</span><span class="fc1 sc0">n</span><span class="fc1 sc0">e</span></div><div class="t m0 x21 hc y88 ff8 fs6 fc0 sc0 ls0 ws0"><span class="fc1 sc0">V</span><span class="fc1 sc0">o</span><span class="fc1 sc0">i</span><span class="fc1 sc0">c</span><span class="fc1 sc0">e</span><span class="fc1 sc0"> </span></div><div class="t m0 x22 hc y89 ff8 fs6 fc0 sc0 ls0 ws0"><span class="fc1 sc0">D</span><span class="fc1 sc0">a</span><span class="fc1 sc0">ta</span></div><div class="t m0 x23 hc y8a ff8 fs6 fc0 sc0 ls0 ws0"><span class="fc1 sc0">U</span><span class="fc1 sc0">ser</span></div><div class="t m0 x24 hb y8b ff7 fs6 fc0 sc0 ls0 ws0"><span class="fc1 sc0">F</span><span class="fc1 sc0">e</span><span class="fc1 sc0">a</span><span class="fc1 sc0">tu</span><span class="fc1 sc0">re </span></div><div class="t m0 x25 hb y8c ff7 fs6 fc0 sc0 ls0 ws0"><span class="fc1 sc0">E</span><span class="fc1 
sc0">xtrac</span><span class="fc1 sc0">t</span><span class="fc1 sc0">i</span><span class="fc1 sc0">o</span><span class="fc1 sc0">n</span></div><div class="t m0 x26 hb y87 ff7 fs6 fc0 sc0 ls0 ws0"><span class="fc1 sc0">Cl</span><span class="fc1 sc0">a</span><span class="fc1 sc0">s</span><span class="fc1 sc0">s</span><span class="fc1 sc0">i</span><span class="fc1 sc0">f</span><span class="fc1 sc0">i</span><span class="fc1 sc0">er</span></div><div class="t m0 x27 hc y8b ff8 fs6 fc0 sc0 ls0 ws0"><span class="fc1 sc0">V</span><span class="fc1 sc0">o</span><span class="fc1 sc0">i</span><span class="fc1 sc0">c</span><span class="fc1 sc0">e</span><span class="fc1 sc0"> </span></div><div class="t m0 x28 hc y8c ff8 fs6 fc0 sc0 ls0 ws0"><span class="fc1 sc0">F</span><span class="fc1 sc0">e</span><span class="fc1 sc0">a</span><span class="fc1 sc0">tur</span><span class="fc1 sc0">e</span></div><div class="t m0 x29 hc y8d ff8 fs6 fc0 sc0 ls0 ws0"><span class="fc1 sc0">S</span><span class="fc1 sc0">p</span><span class="fc1 sc0">e</span><span class="fc1 sc0">a</span><span class="fc1 sc0">ker</span><span class="fc1 sc0"> </span><span class="fc1 sc0">M</span><span class="fc1 sc0">o</span><span class="fc1 sc0">d</span><span class="fc1 sc0">e</span><span class="fc1 sc0">l</span></div><div class="t m0 x2a hc y8e ff8 fs6 fc0 sc0 ls0 ws0"><span class="fc1 sc0">D</span><span class="fc1 sc0">ec</span><span class="fc1 sc0">i</span><span class="fc1 sc0">si</span><span class="fc1 sc0">o</span><span class="fc1 sc0">n</span></div><div class="t m0 x2b hc y8f ff8 fs6 fc0 sc0 ls0 ws0"><span class="fc1 sc0">R</span><span class="fc1 sc0">e</span><span class="fc1 sc0">fer</span><span class="fc1 sc0">e</span><span class="fc1 sc0">n</span><span class="fc1 sc0">ce</span></div></div><div class="t m0 x2c hd y90 ff1 fs2 fc0 sc0 ls0 ws0">Fig.<span class="_ _f"> </span>1.<span class="_ _14"> </span>A<span class="_ _f"> </span>typical<span class="_ _f"> </span>voice<span class="_ _f"> 
</span>authentication<span class="_ _f"> </span>system.</div><div class="t m0 x1b h5 y91 ff1 fs3 fc0 sc0 ls0 ws0">replay<span class="_ _e"> </span>attacks<span class="_ _e"> </span>by<span class="_ _a"> </span>pre-recording<span class="_ _e"> </span>since<span class="_ _e"> </span>they<span class="_ _e"> </span>retain<span class="_ _e"> </span>more<span class="_ _a"> </span>user</div><div class="t m0 x1b h5 y92 ff1 fs3 fc0 sc0 ls0 ws0">characteristics<span class="_"> </span>than<span class="_"> </span>those<span class="_ _13"> </span>generated<span class="_"> </span>by<span class="_"> </span>synthesis<span class="_ _13"> </span>or<span class="_"> </span>con<span class="_ _6"></span>version.</div><div class="t m0 x1c h9 y93 ff4 fs3 fc0 sc0 ls0 ws0">Impersonation<span class="_ _d"> </span>attacks<span class="ff1">.<span class="_ _5"> </span>Impersonation<span class="_ _d"> </span>attacks<span class="_ _d"> </span>can<span class="_ _d"> </span>be<span class="_ _d"> </span>con-</span></div><div class="t m0 x1b h5 y94 ff1 fs3 fc0 sc0 ls0 ws0">ducted<span class="_"> </span>in<span class="_"> </span>two<span class="_"> </span>w<span class="_ _6"></span>ays.<span class="_"> </span>The<span class="_"> </span>first<span class="_"> </span>is<span class="_"> </span>simply<span class="_"> </span>to<span class="_"> </span>imitate<span class="_"> </span>the<span class="_"> </span>le<span class="_ _6"></span>gitimate</div><div class="t m0 x1b h5 y95 ff1 fs3 fc0 sc0 ls0 ws0">user’<span class="_ _6"></span>s<span class="_ _5"> </span>voice<span class="_ _d"> </span>and<span class="_ _5"> </span>speaking<span class="_ _3"> </span>habit<span class="_ _5"> </span>without<span class="_ _5"> </span>the<span class="_ _5"> </span>help<span class="_ _5"> </span>of<span class="_ _5"> </span>other<span class="_ _3"> </span>de-</div><div class="t m0 x1b h5 y96 ff1 fs3 fc0 sc0 ls0 ws0">vices.<span class="_"> </span>The<span class="_ _13"> </span>second<span class="_"> </span>is<span class="_"> 
</span>more<span class="_ _13"> </span>adv<span class="_ _6"></span>anced<span class="_"> </span>where<span class="_"> </span>we<span class="_"> </span>consider<span class="_ _13"> </span>that<span class="_"> </span>the</div><div class="t m0 x1b h5 y97 ff1 fs3 fc0 sc0 ls0 ws0">attacker<span class="_ _3"> </span>kno<span class="_ _6"></span>ws<span class="_ _3"> </span>the<span class="_ _3"> </span>key<span class="_ _3"> </span>rationale<span class="_ _3"> </span>of<span class="_ _3"> </span>our<span class="_ _3"> </span>anti-spoofing<span class="_ _3"> </span>system</div><div class="t m0 x1b h5 y98 ff1 fs3 fc0 sc0 ls0 ws0">and<span class="_ _3"> </span>observes<span class="_ _5"> </span>how<span class="_ _5"> </span>the<span class="_ _3"> </span>target<span class="_ _5"> </span>user<span class="_ _3"> </span>pronounces<span class="_ _5"> </span>the<span class="_ _3"> </span>passphrase.</div><div class="t m0 x1b h5 y99 ff1 fs3 fc0 sc0 ls0 ws0">T<span class="_ _4"></span>o<span class="_ _d"> </span>perform<span class="_ _d"> </span>this<span class="_ _d"> </span>type<span class="_ _d"> </span>of<span class="_ _d"> </span>attacks,<span class="_ _d"> </span>we<span class="_ _5"> </span>assume<span class="_ _d"> </span>that<span class="_ _d"> </span>the<span class="_ _d"> </span>adversary</div><div class="t m0 x1b h5 y9a ff1 fs3 fc0 sc0 ls0 ws0">uses<span class="_ _e"> </span>a<span class="_ _e"> </span>loudspeaker<span class="_ _10"> </span>to<span class="_ _e"> </span>replay<span class="_ _e"> </span>the<span class="_ _e"> </span>pre-recorded<span class="_ _e"> </span>voice<span class="_ _e"> </span>sample</div><div class="t m0 x1b h5 y9b ff1 fs3 fc0 sc0 ls0 ws0">near<span class="_ _3"> </span>the<span class="_ _5"> </span>microphone<span class="_ _3"> </span>while<span class="_ _3"> </span>simultaneously<span class="_ _3"> </span>impersonating<span class="_ _5"> </span>the</div><div class="t m0 x1b h5 y9c ff1 fs3 fc0 sc0 ls0 ws0">victim’<span class="_ 
_6"></span>s<span class="_ _5"> </span>breathing<span class="_ _5"> </span>pattern<span class="_ _5"> </span>closely<span class="_ _5"> </span>to<span class="_ _5"> </span>the<span class="_ _5"> </span>microphone.</div><div class="t m0 x1b h8 y9d ff5 fs3 fc0 sc0 ls0 ws0">B.<span class="_ _a"> </span>P<span class="_ _4"></span>op<span class="_ _5"> </span>Noise</div><div class="t m0 x1c h5 y9e ff1 fs3 fc0 sc0 ls0 ws0">The<span class="_ _10"> </span>human<span class="_ _e"> </span>v<span class="_ _6"></span>oice<span class="_ _e"> </span>is<span class="_ _10"> </span>produced<span class="_ _e"> </span>through<span class="_ _10"> </span>sev<span class="_ _6"></span>eral<span class="_ _e"> </span>stages.<span class="_ _10"> </span>Air</div><div class="t m0 x1b h5 y9f ff1 fs3 fc0 sc0 ls0 ws0">is<span class="_ _5"> </span>first<span class="_ _5"> </span>expelled<span class="_ _d"> </span>from<span class="_ _5"> </span>the<span class="_ _5"> </span>lung<span class="_ _5"> </span>to<span class="_ _5"> </span>form<span class="_ _5"> </span>an<span class="_ _5"> </span>airflow<span class="_ _4"></span>,<span class="_ _5"> </span>which<span class="_ _5"> </span>then</div><div class="t m0 x1b h5 ya0 ff1 fs3 fc0 sc0 ls0 ws0">enters<span class="_ _f"> </span>the<span class="_ _f"> </span>throat,<span class="_ _d"> </span>passes<span class="_ _f"> </span>through<span class="_ _f"> </span>the<span class="_ _d"> </span>v<span class="_ _6"></span>ocal<span class="_ _d"> </span>cords<span class="_"> </span>into<span class="_ _d"> </span>the<span class="_ _f"> </span>vocal</div><div class="t m0 x1b h5 ya1 ff1 fs3 fc0 sc0 ls0 ws0">tract,<span class="_ _10"> </span>and<span class="_ _10"> </span>finally<span class="_ _10"> </span>b<span class="_ _6"></span>ursts<span class="_ _10"> </span>out<span class="_ _10"> </span>of<span class="_ _10"> </span>the<span class="_ _10"> </span>mouth<span class="_ _10"> </span>to<span class="_ _10"> </span>form<span class="_ _10"> </span>the<span 
class="_ _10"> </span>sound</div><div class="t m0 x1b h5 ya2 ff1 fs3 fc0 sc0 ls0 ws0">wa<span class="_ _6"></span>ve.<span class="_ _b"> </span>When<span class="_ _14"> </span>the<span class="_ _b"> </span>resulting<span class="_ _14"> </span>airflow<span class="_ _a"> </span>reaches<span class="_ _14"> </span>the<span class="_ _b"> </span>microphone,</div><div class="t m0 x1b h5 ya3 ff1 fs3 fc0 sc0 ls0 ws0">if<span class="_ _e"> </span>the<span class="_ _e"> </span>user’<span class="_ _6"></span>s<span class="_ _e"> </span>mouth<span class="_ _e"> </span>is<span class="_ _e"> </span>close<span class="_ _e"> </span>enough<span class="_ _e"> </span>to<span class="_ _e"> </span>the<span class="_ _e"> </span>microphone,<span class="_ _e"> </span>the</div><div class="t m0 x1b h5 ya4 ff1 fs3 fc0 sc0 ls0 ws0">captured<span class="_ _8"> </span>sound<span class="_ _8"> </span>signals<span class="_ _8"> </span>will<span class="_ _8"> </span>not<span class="_ _8"> </span>only<span class="_ _8"> </span>contain<span class="_ _8"> </span>the<span class="_ _8"> </span>speech</div><div class="t m0 x1b h5 ya5 ff1 fs3 fc0 sc0 ls0 ws0">information<span class="_ _5"> </span>but<span class="_ _5"> </span>also<span class="_ _5"> </span>the<span class="_ _5"> </span>plosiv<span class="_ _6"></span>e<span class="_ _3"> </span>burst<span class="_ _d"> </span>as<span class="_ _3"> </span>the<span class="_ _5"> </span>friction<span class="_ _5"> </span>between</div><div class="t m0 x1b h5 ya6 ff1 fs3 fc0 sc0 ls0 ws0">the<span class="_ _5"> </span>lips<span class="_ _3"> </span>and<span class="_ _5"> </span>the<span class="_ _5"> </span>airflow<span class="_ _4"></span>,<span class="_ _3"> </span>known<span class="_ _d"> </span>as<span class="_ _3"> </span>the<span class="_ _5"> </span>pop<span class="_ _3"> </span>noise.<span class="_ _5"> </span>In<span class="_ _3"> </span>contrast,</div><div class="t m0 x1b h5 ya7 ff1 fs3 fc0 sc0 ls0 ws0">an<span class="_ _d"> </span>attacker<span 
class="_ _d"> </span>who<span class="_ _5"> </span>tries<span class="_ _d"> </span>to<span class="_ _5"> </span>launch<span class="_ _d"> </span>a<span class="_ _d"> </span>replay<span class="_ _5"> </span>attack<span class="_ _d"> </span>usually<span class="_ _5"> </span>cannot</div><div class="t m0 x1b h5 ya8 ff1 fs3 fc0 sc0 ls0 ws0">put<span class="_ _b"> </span>the<span class="_ _14"> </span>microphone<span class="_ _14"> </span>of<span class="_ _14"> </span>the<span class="_ _14"> </span>recording<span class="_ _b"> </span>device<span class="_ _b"> </span>very<span class="_ _b"> </span>close<span class="_ _14"> </span>to</div><div class="t m0 x1b h5 ya9 ff1 fs3 fc0 sc0 ls0 ws0">the<span class="_ _a"> </span>user’<span class="_ _6"></span>s<span class="_ _a"> </span>mouth,<span class="_ _a"> </span>thus<span class="_ _a"> </span>the<span class="_ _a"> </span>recorded<span class="_ _b"> </span>voice<span class="_ _e"> </span>contains<span class="_ _b"> </span>no<span class="_ _a"> </span>pop</div><div class="t m0 x1b h5 yaa ff1 fs3 fc0 sc0 ls0 ws0">noise.<span class="_ _3"> </span>Therefore,<span class="_ _10"> </span>by<span class="_ _10"> </span>detecting<span class="_ _3"> </span>the<span class="_ _10"> </span>pop<span class="_ _10"> </span>noise,<span class="_ _3"> </span>we<span class="_ _10"> </span>are<span class="_ _10"> </span>able<span class="_ _3"> </span>to</div><div class="t m0 x1b h5 yab ff1 fs3 fc0 sc0 ls0 ws0">distinguish<span class="_ _5"> </span>the<span class="_ _3"> </span>real<span class="_ _5"> </span>speech<span class="_ _3"> </span>from<span class="_ _5"> </span>a<span class="_ _3"> </span>liv<span class="_ _6"></span>e<span class="_ _3"> </span>user<span class="_ _5"> </span>and<span class="_ _3"> </span>the<span class="_ _5"> </span>recorded</div><div class="t m0 x1b h5 yac ff1 fs3 fc0 sc0 ls0 ws0">speech<span class="_ _5"> </span>from<span class="_ _5"> </span>a<span class="_ _5"> </span>loudspeaker<span class="_ _4"></span>.</div><div 
class="t m0 x1c h5 yad ff1 fs3 fc0 sc0 ls0 ws0">T<span class="_ _4"></span>o<span class="_"> </span>detect<span class="_"> </span>pop<span class="_"> </span>noise,<span class="_"> </span>we<span class="_"> </span>compare<span class="_"> </span>the<span class="_ _13"> </span>spectrograms<span class="_"> </span>of<span class="_"> </span>speech</div><div class="t m0 x1b h5 yae ff1 fs3 fc0 sc0 ls0 ws0">signals<span class="_"> </span>with<span class="_"> </span>and<span class="_ _f"> </span>without<span class="_"> </span>a<span class="_ _f"> </span>pop<span class="_"> </span>noise<span class="_ _f"> </span>filter<span class="_"> </span>using<span class="_"> </span>three<span class="_ _f"> </span>different</div><div class="t m0 x1b h5 yaf ff1 fs3 fc0 sc0 ls0 ws0">smartphones,<span class="_"> </span>as<span class="_"> </span>sho<span class="_ _6"></span>wn<span class="_"> </span>in<span class="_"> </span>Fig.<span class="_"> </span>2.<span class="_"> </span>It<span class="_"> </span>can<span class="_"> </span>be<span class="_"> </span>found<span class="_"> </span>that<span class="_"> </span>pop<span class="_"> </span>noise</div><div class="t m0 x1b he yb0 ff1 fs3 fc0 sc0 ls0 ws0">has<span class="_ _d"> </span>a<span class="_ _d"> </span>high<span class="_ _f"> </span>energy<span class="_ _d"> </span>in<span class="_ _d"> </span>the<span class="_ _d"> </span>lo<span class="_ _6"></span>w<span class="_ _d"> </span>frequency<span class="_ _f"> </span>(typically<span class="_ _d"> </span>0<span class="ff9">∼</span>100<span class="_ _d"> </span>Hz),</div><div class="t m0 x1b h5 yb1 ff1 fs3 fc0 sc0 ls0 ws0">which<span class="_ _3"> </span>has<span class="_ _3"> </span>been<span class="_ _10"> </span>discussed<span class="_ _3"> </span>in<span class="_ _3"> </span>the<span class="_ _3"> </span>prior<span class="_ _10"> </span>study<span class="_ _3"> </span>[14].<span class="_ _3"> </span>Moreover<span class="_ _6"></span>,</div><div class="t m0 x1b he yb2 ff1 fs3 fc0 sc0 ls0 
ws0">the<span class="_ _10"> </span>duration<span class="_ _10"> </span>of<span class="_ _e"> </span>pop<span class="_ _10"> </span>noise<span class="_ _10"> </span>varies<span class="_ _10"> </span>in<span class="_ _10"> </span>the<span class="_ _10"> </span>range<span class="_ _e"> </span>20<span class="ff9">∼</span>100<span class="_ _10"> </span>msec</div><div class="t m0 x1b h5 yb3 ff1 fs3 fc0 sc0 ls0 ws0">based<span class="_ _e"> </span>on<span class="_ _a"> </span>the<span class="_ _e"> </span>way<span class="_ _e"> </span>people<span class="_ _a"> </span>speak<span class="_ _e"> </span>and<span class="_ _a"> </span>breathe.<span class="_ _e"> </span>Our<span class="_ _a"> </span>detection</div><div class="t m0 x1b h5 yb4 ff1 fs3 fc0 sc0 ls0 ws0">algorithm<span class="_ _5"> </span>is<span class="_ _5"> </span>based<span class="_ _5"> </span>on<span class="_ _5"> </span>these<span class="_ _5"> </span>observations.</div><div class="t m0 x1b h8 yb5 ff5 fs3 fc0 sc0 ls0 ws0">C.<span class="_ _a"> </span>Phoneme<span class="_ _5"> </span>and<span class="_ _5"> </span>P<span class="_ _4"></span>op<span class="_ _5"> </span>Noise</div><div class="t m0 x1c h5 y32 ff1 fs3 fc0 sc0 ls0 ws0">A<span class="_ _11"> </span>phoneme<span class="_ _9"> </span>is<span class="_ _11"> </span>the<span class="_ _9"> </span>smallest<span class="_ _9"> </span>distincti<span class="_ _6"></span>ve<span class="_ _9"> </span>unit<span class="_ _11"> </span>sound<span class="_ _9"> </span>of<span class="_ _11"> </span>a</div><div class="t m0 x1b h5 y33 ff1 fs3 fc0 sc0 ls0 ws0">language<span class="_ _e"> </span>in<span class="_ _e"> </span>the<span class="_ _e"> </span>human<span class="_ _a"> </span>speech<span class="_ _e"> </span>production<span class="_ _e"> </span>system.<span class="_ _e"> </span>There<span class="_ _e"> </span>are</div><div class="t m0 x1b h5 y34 ff1 fs3 fc0 sc0 ls0 ws0">two<span class="_ _a"> </span>categories<span class="_ _e"> </span>of<span class="_ _a"> </span>phonemes,<span class="_ 
_a"> </span>the<span class="_ _a"> </span>vo<span class="_ _6"></span>wel<span class="_ _a"> </span>and<span class="_ _a"> </span>the<span class="_ _b"> </span>consonant.</div><div class="t m0 x1b h5 y35 ff1 fs3 fc0 sc0 ls0 ws0">A<span class="_ _14"> </span>vo<span class="_ _6"></span>wel<span class="_ _14"> </span>is<span class="_ _14"> </span>a<span class="_ _14"> </span>sound<span class="_ _14"> </span>produced<span class="_ _11"> </span>by<span class="_ _14"> </span>the<span class="_ _14"> </span>airflow<span class="_ _b"> </span>through<span class="_ _14"> </span>the</div><div class="t m0 x1b h5 y36 ff1 fs3 fc0 sc0 ls0 ws0">mouth<span class="_ _10"> </span>without<span class="_ _3"> </span>hindrance,<span class="_ _10"> </span>while<span class="_ _10"> </span>a<span class="_ _10"> </span>consonant<span class="_ _3"> </span>is<span class="_ _10"> </span>produced<span class="_ _10"> </span>by</div><div class="t m0 x1b h5 y57 ff1 fs3 fc0 sc0 ls0 ws0">obstructing<span class="_"> </span>the<span class="_"> </span>airflow<span class="_"> </span>out<span class="_"> </span>of<span class="_ _f"> </span>the<span class="_"> </span>mouth<span class="_ _f"> </span>with<span class="_"> </span>the<span class="_ _c"> </span>teeth,<span class="_ _f"> </span>tongue,</div><div class="t m0 x1b h5 y38 ff1 fs3 fc0 sc0 ls0 ws0">lips<span class="_ _10"> </span>or<span class="_ _10"> </span>palate.<span class="_ _10"> </span>Since<span class="_ _10"> </span>each<span class="_ _10"> </span>phoneme<span class="_ _10"> </span>features<span class="_ _10"> </span>unique<span class="_ _10"> </span>physical</div><div class="t m0 x1b h5 y39 ff1 fs3 fc0 sc0 ls0 ws0">origin<span class="_"> </span>in<span class="_ _f"> </span>the<span class="_ _f"> </span>human<span class="_ _f"> </span>vocal<span class="_"> </span>tract<span class="_"> </span>system<span class="_ _f"> </span>and<span class="_ _f"> </span>has<span class="_ _c"> </span>its<span class="_ _f"> </span>own<span class="_"> 
</span>manner</div></div><div class="pi" data-data='{"ctm":[1.568627,0.000000,0.000000,1.568627,0.000000,0.000000]}'></div></div>