diff --git a/src/main/java/org/spdx/utility/DownloadCache.java b/src/main/java/org/spdx/utility/DownloadCache.java index 3cdb7548..b2edf0ea 100644 --- a/src/main/java/org/spdx/utility/DownloadCache.java +++ b/src/main/java/org/spdx/utility/DownloadCache.java @@ -30,6 +30,8 @@ import java.io.Reader; import java.io.Writer; import java.net.HttpURLConnection; +import java.net.URI; +import java.net.URISyntaxException; import java.net.URL; import java.nio.charset.StandardCharsets; import java.nio.file.Files; @@ -44,6 +46,9 @@ import java.util.HashMap; import java.util.List; import java.util.Objects; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; import com.google.gson.Gson; import com.google.gson.reflect.TypeToken; @@ -107,7 +112,7 @@ private DownloadCache() { try { final File cacheDirectory = new File(cacheDir); Files.createDirectories(cacheDirectory.toPath()); - } catch (IOException ioe) { + } catch (final IOException ioe) { logger.warn("Unable to create cache directory '{}'; continuing with cache disabled.", cacheDir, ioe); tmpCacheEnabled = false; } @@ -116,7 +121,7 @@ private DownloadCache() { long tmpCacheCheckIntervalSecs = DEFAULT_CACHE_CHECK_INTERVAL_SECS; try { tmpCacheCheckIntervalSecs = Long.parseLong(Configuration.getInstance().getProperty(CONFIG_PROPERTY_CACHE_CHECK_INTERVAL_SECS)); - } catch(NumberFormatException nfe) { + } catch (final NumberFormatException nfe) { // Ignore parse failures - in this case we use the default value of 24 hours } cacheCheckIntervalSecs = tmpCacheCheckIntervalSecs; @@ -162,16 +167,41 @@ public void resetCache() throws IOException { } /** - * @param url The URL to get an input stream for. Note that redirects issued by this url are restricted to known - * SPDX hosts. Redirects to other hosts will cause an IOException to be thrown. + * @param url The URL to get an input stream for. Notes: redirects issued by this url are restricted to known + * SPDX hosts; redirects to other hosts will cause an IOException to be thrown. * @return An InputStream for url, or null if url is null. Note that this InputStream may be of different concrete - * types, depending on whether the content is being served out of cache or not. + * types, depending on whether the content is being served out of cache or not. * @throws IOException When an IO error of some kind occurs. */ public InputStream getUrlInputStream(final URL url) throws IOException { return getUrlInputStream(url, true); } + /** + * @param url The URL to normalize. + * @return A normalized rendition of the url, as a String. + */ + private static String normalizeURL(final URL url) { + String result = null; + + if (url != null) { + try { + URI uri = new URI(url.toString()).normalize(); // JDK normalization + + // Then manually strip fragment as well + uri = new URI(uri.getScheme(), uri.getUserInfo(), uri.getHost(), uri.getPort(), uri.getPath(), uri.getQuery(), null); + result = uri.toString(); + } catch (final URISyntaxException e) { + result = url.toString(); // Fallback on naive stringification if normalization fails + } + } + + return result; + } + + // A collection of per-URL locks - note that this will grow without bound as the number of URLs requested through the cache grows + private final ConcurrentHashMap perUrlLocks = new ConcurrentHashMap<>(); + /** * @param url The URL to get an input stream for. * @param restrictRedirects A flag that controls whether redirects returned by url are restricted to known SPDX @@ -182,9 +212,21 @@ public InputStream getUrlInputStream(final URL url) throws IOException { */ public InputStream getUrlInputStream(final URL url, final boolean restrictRedirects) throws IOException { InputStream result = null; + if (url != null) { if (cacheEnabled) { - result = getUrlInputStreamThroughCache(url, restrictRedirects); + // Per-URL critical section (to prevent cache stampede) + final String normalizedUrl = normalizeURL(url); + perUrlLocks.computeIfAbsent(normalizedUrl, k -> new ReentrantLock()); + final Lock lock = perUrlLocks.get(normalizedUrl); + lock.lock(); + + try { + result = getUrlInputStreamThroughCache(url, restrictRedirects); + } finally { + lock.unlock(); + } + // End of per-URL critical section } else { result = getUrlInputStreamDirect(url, restrictRedirects); } @@ -224,7 +266,7 @@ private InputStream getUrlInputStreamDirect(URL url, boolean restrictRedirects) * @param restrictRedirects A flag that controls whether redirects returned by url are restricted to known SPDX * hosts or not. Defaults to true. USE EXTREME CAUTION WHEN TURNING THIS OFF! * @return An InputStream for url, or null if url is null. Note that this InputStream may be of different concrete - * types, depending on whether the content is being served out of cache or not. + * types, depending on whether the content is being served out of cache or not. * @throws IOException When an IO error of some kind occurs. */ private InputStream getUrlInputStreamThroughCache(final URL url, boolean restrictRedirects) throws IOException {