From 795b87fabe97dd10658e7180fa165c9be792a08f Mon Sep 17 00:00:00 2001 From: Peter Monks Date: Tue, 3 Mar 2026 11:05:43 -0800 Subject: [PATCH 1/4] :ambulance: Fix for issue #394 - cache stampede --- .../java/org/spdx/utility/DownloadCache.java | 47 +++++++++++-------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/src/main/java/org/spdx/utility/DownloadCache.java b/src/main/java/org/spdx/utility/DownloadCache.java index 3cdb7548..7ed4773b 100644 --- a/src/main/java/org/spdx/utility/DownloadCache.java +++ b/src/main/java/org/spdx/utility/DownloadCache.java @@ -162,8 +162,9 @@ public void resetCache() throws IOException { } /** - * @param url The URL to get an input stream for. Note that redirects issued by this url are restricted to known - * SPDX hosts. Redirects to other hosts will cause an IOException to be thrown. + * @param url The URL to get an input stream for. Notes: redirects issued by this url are restricted to known + * SPDX hosts; redirects to other hosts will cause an IOException to be thrown. This method may + * synchronize on this argument, so callers should avoid using it for synchronization. * @return An InputStream for url, or null if url is null. Note that this InputStream may be of different concrete * types, depending on whether the content is being served out of cache or not. * @throws IOException When an IO error of some kind occurs. @@ -173,7 +174,8 @@ public InputStream getUrlInputStream(final URL url) throws IOException { } /** - * @param url The URL to get an input stream for. + * @param url The URL to get an input stream for. Note: this method may synchronize on this argument, so callers + * should avoid using it for synchronization. * @param restrictRedirects A flag that controls whether redirects returned by url are restricted to known SPDX * hosts or not. Defaults to true. USE EXTREME CAUTION WHEN TURNING THIS OFF! * @return An InputStream for url, or null if url is null. Note that this InputStream may be of different concrete @@ -184,7 +186,10 @@ public InputStream getUrlInputStream(final URL url, final boolean restrictRedire InputStream result = null; if (url != null) { if (cacheEnabled) { - result = getUrlInputStreamThroughCache(url, restrictRedirects); + // Per-URL critical section to prevent cache stampede + synchronized (url) { + result = getUrlInputStreamThroughCache(url, restrictRedirects); + } } else { result = getUrlInputStreamDirect(url, restrictRedirects); } @@ -228,24 +233,26 @@ private InputStream getUrlInputStreamDirect(URL url, boolean restrictRedirects) * @throws IOException When an IO error of some kind occurs. */ private InputStream getUrlInputStreamThroughCache(final URL url, boolean restrictRedirects) throws IOException { - final String cacheKey = base64Encode(url); - final File cachedFile = new File(cacheDir, cacheKey); - final File cachedMetadataFile = new File(cacheDir, cacheKey + ".metadata.json"); - - if (cachedFile.exists() && cachedMetadataFile.exists()) { - try { - checkCache(url, restrictRedirects); - } catch (IOException ioe) { - // We know we have a locally cached file here, so if we happen to get an exception we can safely ignore - // it and fall back on the (possibly stale) cached content file. This makes the code more robust in the - // presence of network errors when the cache has previously been populated. + synchronized (url) { + final String cacheKey = base64Encode(url); + final File cachedFile = new File(cacheDir, cacheKey); + final File cachedMetadataFile = new File(cacheDir, cacheKey + ".metadata.json"); + + if (cachedFile.exists() && cachedMetadataFile.exists()) { + try { + checkCache(url, restrictRedirects); + } catch (IOException ioe) { + // We know we have a locally cached file here, so if we happen to get an exception we can safely ignore + // it and fall back on the (possibly stale) cached content file. This makes the code more robust in the + // presence of network errors when the cache has previously been populated. + } + } else { + cacheMiss(url, restrictRedirects); } - } else { - cacheMiss(url, restrictRedirects); - } - // At this point the cached file definitely exists - return new BufferedInputStream(Files.newInputStream(cachedFile.toPath())); + // At this point the cached file definitely exists + return new BufferedInputStream(Files.newInputStream(cachedFile.toPath())); + } } /** From ff2586c61706dd066da5ed9d6a9d89701ac5bfc0 Mon Sep 17 00:00:00 2001 From: Peter Monks Date: Tue, 3 Mar 2026 14:21:11 -0800 Subject: [PATCH 2/4] :ambulance: Switch to explicit collection of locks, as synchronizing on URL objects isn't reliable --- .../java/org/spdx/utility/DownloadCache.java | 95 +++++++++++++------ 1 file changed, 66 insertions(+), 29 deletions(-) diff --git a/src/main/java/org/spdx/utility/DownloadCache.java b/src/main/java/org/spdx/utility/DownloadCache.java index 7ed4773b..3e65a5a3 100644 --- a/src/main/java/org/spdx/utility/DownloadCache.java +++ b/src/main/java/org/spdx/utility/DownloadCache.java @@ -30,6 +30,8 @@ import java.io.Reader; import java.io.Writer; import java.net.HttpURLConnection; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.nio.charset.StandardCharsets; import java.nio.file.Files; @@ -44,6 +46,9 @@ import java.util.HashMap; import java.util.List; import java.util.Objects; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; import com.google.gson.Gson; import com.google.gson.reflect.TypeToken; @@ -107,7 +112,7 @@ private DownloadCache() { try { final File cacheDirectory = new File(cacheDir); Files.createDirectories(cacheDirectory.toPath()); - } catch (IOException ioe) { + } catch (final IOException ioe) { logger.warn("Unable to create cache directory '{}'; continuing with cache disabled.", cacheDir, ioe); tmpCacheEnabled = false; } @@ -116,7 +121,7 @@ private DownloadCache() { long tmpCacheCheckIntervalSecs = DEFAULT_CACHE_CHECK_INTERVAL_SECS; try { tmpCacheCheckIntervalSecs = Long.parseLong(Configuration.getInstance().getProperty(CONFIG_PROPERTY_CACHE_CHECK_INTERVAL_SECS)); - } catch(NumberFormatException nfe) { + } catch (final NumberFormatException nfe) { // Ignore parse failures - in this case we use the default value of 24 hours } cacheCheckIntervalSecs = tmpCacheCheckIntervalSecs; @@ -163,10 +168,10 @@ public void resetCache() throws IOException { /** * @param url The URL to get an input stream for. Notes: redirects issued by this url are restricted to known - * SPDX hosts; redirects to other hosts will cause an IOException to be thrown. This method may - * synchronize on this argument, so callers should avoid using it for synchronization. - * @return An InputStream for url, or null if url is null. Note that this InputStream may be of different concrete - * types, depending on whether the content is being served out of cache or not. + * SPDX hosts; redirects to other hosts will cause an IOException to be thrown. + * @return An InputStream for url, or null if url is null or the method is interrupted during execution. Note that + * this InputStream may be of different concrete types, depending on whether the content is being served out + * of cache or not. * @throws IOException When an IO error of some kind occurs. */ public InputStream getUrlInputStream(final URL url) throws IOException { @@ -174,22 +179,56 @@ public InputStream getUrlInputStream(final URL url) throws IOException { } /** - * @param url The URL to get an input stream for. Note: this method may synchronize on this argument, so callers - * should avoid using it for synchronization. + * @param url The URL to normalize. + * @return A normalized rendition of the url, as a String. + */ + private static String normalizeURL(final URL url) { + String result = null; + + if (url != null) { + try { + URI uri = new URI(url.toString()).normalize(); // JDK normalization + + // Then manually strip fragment as well + uri = new URI(uri.getScheme(), uri.getUserInfo(), uri.getHost(), uri.getPort(), uri.getPath(), uri.getQuery(), null); + result = uri.toString(); + } catch (final URISyntaxException e) { + result = url.toString(); // Fallback on naive stringification if normalization fails + } + } + + return result; + } + + // A collection of per-URL locks - note that this will grow without bound as the number of URLs requested through the cache grows + private final ConcurrentHashMap perUrlLocks = new ConcurrentHashMap<>(); + + /** + * @param url The URL to get an input stream for. * @param restrictRedirects A flag that controls whether redirects returned by url are restricted to known SPDX * hosts or not. Defaults to true. USE EXTREME CAUTION WHEN TURNING THIS OFF! - * @return An InputStream for url, or null if url is null. Note that this InputStream may be of different concrete - * types, depending on whether the content is being served out of cache or not. + * @return An InputStream for url, or null if url is null or the method is interrupted during execution. Note that + * this InputStream may be of different concrete types, depending on whether the content is being served out + * of cache or not. * @throws IOException When an IO error of some kind occurs. */ public InputStream getUrlInputStream(final URL url, final boolean restrictRedirects) throws IOException { InputStream result = null; + if (url != null) { if (cacheEnabled) { - // Per-URL critical section to prevent cache stampede - synchronized (url) { + // Per-URL critical section (to prevent cache stampede) + final String normalizedUrl = normalizeURL(url); + perUrlLocks.putIfAbsent(normalizedUrl, new ReentrantLock()); + final Lock lock = perUrlLocks.get(normalizedUrl); + lock.lock(); + + try { result = getUrlInputStreamThroughCache(url, restrictRedirects); + } finally { + lock.unlock(); } + // End of per-URL critical section } else { result = getUrlInputStreamDirect(url, restrictRedirects); } @@ -233,26 +272,24 @@ private InputStream getUrlInputStreamDirect(URL url, boolean restrictRedirects) * @throws IOException When an IO error of some kind occurs. */ private InputStream getUrlInputStreamThroughCache(final URL url, boolean restrictRedirects) throws IOException { - synchronized (url) { - final String cacheKey = base64Encode(url); - final File cachedFile = new File(cacheDir, cacheKey); - final File cachedMetadataFile = new File(cacheDir, cacheKey + ".metadata.json"); + final String cacheKey = base64Encode(url); + final File cachedFile = new File(cacheDir, cacheKey); + final File cachedMetadataFile = new File(cacheDir, cacheKey + ".metadata.json"); - if (cachedFile.exists() && cachedMetadataFile.exists()) { - try { - checkCache(url, restrictRedirects); - } catch (IOException ioe) { - // We know we have a locally cached file here, so if we happen to get an exception we can safely ignore - // it and fall back on the (possibly stale) cached content file. This makes the code more robust in the - // presence of network errors when the cache has previously been populated. - } - } else { - cacheMiss(url, restrictRedirects); + if (cachedFile.exists() && cachedMetadataFile.exists()) { + try { + checkCache(url, restrictRedirects); + } catch (IOException ioe) { + // We know we have a locally cached file here, so if we happen to get an exception we can safely ignore + // it and fall back on the (possibly stale) cached content file. This makes the code more robust in the + // presence of network errors when the cache has previously been populated. } + } else { + cacheMiss(url, restrictRedirects); + } - // At this point the cached file definitely exists - return new BufferedInputStream(Files.newInputStream(cachedFile.toPath())); - } + // At this point the cached file definitely exists + return new BufferedInputStream(Files.newInputStream(cachedFile.toPath())); } /** From 6495e1c8ad133e11f1f833df45fe16dc688be44c Mon Sep 17 00:00:00 2001 From: Peter Monks Date: Tue, 3 Mar 2026 17:32:43 -0800 Subject: [PATCH 3/4] Update src/main/java/org/spdx/utility/DownloadCache.java Only construct new `ReentrantLock` if needed. Co-authored-by: Gary O'Neall Signed-off-by: Peter Monks --- src/main/java/org/spdx/utility/DownloadCache.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/spdx/utility/DownloadCache.java b/src/main/java/org/spdx/utility/DownloadCache.java index 3e65a5a3..ddd45dc9 100644 --- a/src/main/java/org/spdx/utility/DownloadCache.java +++ b/src/main/java/org/spdx/utility/DownloadCache.java @@ -219,7 +219,7 @@ public InputStream getUrlInputStream(final URL url, final boolean restrictRedire if (cacheEnabled) { // Per-URL critical section (to prevent cache stampede) final String normalizedUrl = normalizeURL(url); - perUrlLocks.putIfAbsent(normalizedUrl, new ReentrantLock()); + perUrlLocks.computeIfAbsent(normalizedUrl, k -> new ReentrantLock()); final Lock lock = perUrlLocks.get(normalizedUrl); lock.lock(); From 53df6d8d738c20682cafb9a801b5a6e47ed62bc5 Mon Sep 17 00:00:00 2001 From: Peter Monks Date: Tue, 3 Mar 2026 18:56:01 -0800 Subject: [PATCH 4/4] :books: Correct JavaDoc comment --- src/main/java/org/spdx/utility/DownloadCache.java | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/spdx/utility/DownloadCache.java b/src/main/java/org/spdx/utility/DownloadCache.java index ddd45dc9..b2edf0ea 100644 --- a/src/main/java/org/spdx/utility/DownloadCache.java +++ b/src/main/java/org/spdx/utility/DownloadCache.java @@ -169,9 +169,8 @@ public void resetCache() throws IOException { /** * @param url The URL to get an input stream for. Notes: redirects issued by this url are restricted to known * SPDX hosts; redirects to other hosts will cause an IOException to be thrown. - * @return An InputStream for url, or null if url is null or the method is interrupted during execution. Note that - * this InputStream may be of different concrete types, depending on whether the content is being served out - * of cache or not. + * @return An InputStream for url, or null if url is null. Note that this InputStream may be of different concrete + * types, depending on whether the content is being served out of cache or not. * @throws IOException When an IO error of some kind occurs. */ public InputStream getUrlInputStream(final URL url) throws IOException { @@ -207,9 +206,8 @@ private static String normalizeURL(final URL url) { * @param url The URL to get an input stream for. * @param restrictRedirects A flag that controls whether redirects returned by url are restricted to known SPDX * hosts or not. Defaults to true. USE EXTREME CAUTION WHEN TURNING THIS OFF! - * @return An InputStream for url, or null if url is null or the method is interrupted during execution. Note that - * this InputStream may be of different concrete types, depending on whether the content is being served out - * of cache or not. + * @return An InputStream for url, or null if url is null. Note that this InputStream may be of different concrete + * types, depending on whether the content is being served out of cache or not. * @throws IOException When an IO error of some kind occurs. */ public InputStream getUrlInputStream(final URL url, final boolean restrictRedirects) throws IOException { @@ -268,7 +266,7 @@ private InputStream getUrlInputStreamDirect(URL url, boolean restrictRedirects) * @param restrictRedirects A flag that controls whether redirects returned by url are restricted to known SPDX * hosts or not. Defaults to true. USE EXTREME CAUTION WHEN TURNING THIS OFF! * @return An InputStream for url, or null if url is null. Note that this InputStream may be of different concrete - * types, depending on whether the content is being served out of cache or not. + * types, depending on whether the content is being served out of cache or not. * @throws IOException When an IO error of some kind occurs. */ private InputStream getUrlInputStreamThroughCache(final URL url, boolean restrictRedirects) throws IOException {