最大子数组问题

最大子数组问题

Maximum-Subarray

本文讨论的数组均为存在负值元素的数组。数组下标从0开始。

返回值定义

1
(Low, High, Sum)

分别表示子数组左端点下标、右端点下标和子数组之和。

暴力解法

遍历 \(C_n^2+n=\frac{n(n+1)}{2}\) 种可能的区间情况。利用之前计算出的子数组之和来计算下一个子数组之和,使得算法达到 \(O(n^2)\) 的时间复杂度。

1
2
3
4
5
6
7
8
9
10
11
12
13
Tracerse_Solution(A, Low, High)
result_Sum = -infty
sum = 0
sum_Last = 0
for i = Low to High
for j = i to High
sum = sum_Last + A[j]
if sum > result_Sum
result_Sum = sum
result_Low = i
result_High = j
sum_Last = 0
return (result_Low, result_High, result_Sum)

使用 sum_Last 记录上一个子数组之和。

假设子数组之和 \(SUM\) 是均匀分布: \[ p(SUM)= \begin{cases} \frac{1}{A-I}&,I\leq SUM\leq A\\ 0&,Others \end{cases} \] 其中,\(I\) 为 MIN,\(A\) 为 MAX。

通过如下程序进行模拟,观察后项比前项大的情况个数比例,模拟结果为 \(0.4949\approx0.5\)

那么 if 语句块的执行次数的期望为: \[ E(Comp)=\frac{n(n+1)}{4} \] 算法时间复杂度为: \[ \begin{align} T(n)=&c_1+c_2+c_3+c_{15}+(c_4+c_{11})\cdot n+(c_5+c_6)\cdot\frac{n(n+1)}{2}+\\&(c_7+c_8+c_9+c_{10})\cdot\frac{n(n+1)}{4}\\ =&O(n^2) \end{align} \]

分治解法

将子数组分为左侧、右侧和横跨中点的子数组。

计算横跨中点的最大子数组:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
Find_Max_Crossing_Subarray(A, Low, High)
Mid = (Low + High) / 2
Low_Sum = -infty
High_Sum = -infty
sum = 0
for i = Mid down to Low
sum = sum + A[i]
if sum > Low_Sum
Low_Sum = sum
result_Low = i
sum = 0
for i = Mid + 1 to High
sum = sum + A[i]
if sum > High_Sum
High_Sum = sum
result_High = i
return (result_Low, result_High, Low_Sum + High_Sum)

\[ \begin{align} T(n)=&\sum_{i=1}^4c_i+c_{10}+(c_5+c_6)\lfloor\frac{n}{2}\rfloor+(c_{11}+c_{12})\lceil\frac{n}{2}\rceil+\\&(\sum_{i=7}^9c_i+\sum_{i=13}^{15}c_i)\frac{n}{2}\\ =&O(n) \end{align} \]

计算最大子数组,递归、分治:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
Find_Max_Subarray(A, Low, High)
if Low == High
return (Low, High, A[Low])
else
Mid = (Low + High) / 2
(Low_Low, Low_High, Low_Sum) = Find_Max_Subarray(A, Low, Mid)
(Mid_Low, Mid_High, Mid_Sum) = Find_Max_Crossing_Subarray(A, Low, High)
(High_Low, High_High, High_Sum) = Find_Max_Subarray(A, Mid + 1, High)
if Low_Sum >= Mid_Sum and Low_Sum >= High_Sum
return (Low_Low, Low_High, Low_Sum)
else if Mid_Sum >= Low_Sum and Mid_Sum >= High_Sum
return (Mid_Low, Mid_High, Mid_Sum)
else
return (High_Low, High_High, High_Sum)

\[ T(n)= \begin{cases} O(1)&,n=1\\ 2T(n/2)+O(n)&,n>1 \end{cases} \Rightarrow T(n)=O(n\lg{n}) \]

性能分析

随机生成 \([-DoubleTop/2,DoubleTop/2]\) 范围内的长为 \(Len\) 的数组,通过C语言代码进行时间测试,精度为 \(1\;\text{ms}\)。结果发现在数组长度约为 \(500\) 时,遍历算法与分治算法的耗时相等;数组长度超过 \(500\) 后,分治算法的耗时明显低于遍历算法。

线性时间算法(毛毛虫算法)

思路:若已知 \(A[1,j]\) 的最大子数组,那么 \(A[1,j+1]\) 的最大子数组可能有两种情况:

  1. 仍然为 \(A[1,j]\) 的最大子数组;
  2. 为子数组 \(A[i,j+1],\;1\leq i\leq j+1\)

首先,对于已知的最大子数组 \(A[i,j]\),需要不断向前计算 \(sum(A[i,j+1])\)\(sum(A[i,j+2])\) ……比较它们与已知最大子数组之和的大小,若更大,则更新当前最大的子数组。

然而,一旦 \(sum(A[i,j+k])<0\),则向前计算没有意义,直接考虑 \(A[k+1]\) 与已知最大子数组之和的大小关系。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
Caterpiller_Linear_Solution(A, Low, High)
result_Low = Low
result_High = Low
result_Sum = A[Low]
pioneer = A[Low]
tmp_Low = Low
for i = Low + 1 to High
if pioneer > 0
pioneer = pioneer + A[i]
else
pioneer = A[i]
tmp_Low = i
if result_Sum < pioneer
result_Sum = pioneer
result_Low = tmp_Low
result_High = i
return (result_Low, result_High, result_Sum)

使用 pioneer 计算从当前最大子数组开始,向前的子数组之和;

使用 tmp_Low 记录当 pioneer < 0 时,待考虑的新的子数组起始位置。

C语言代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
/*
** @FileName: Max_SubArray.c
** @Author: QCF
** @Date: 07-08-2020
** @Version: 1.0.0
** @Description: 求最大子数组算法,分治。
** @Encoding: UTF-8
*/

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>
#include <windows.h>

struct MAX_SUB_ARRAY;
typedef struct MAX_SUB_ARRAY max_subarray;

struct MAX_SUB_ARRAY{
int Low, High;
double Sum;
};

/* 横跨中点的最大子数组 */
max_subarray *Find_Max_Crossing_Subarray(int *A, int Low, int Mid, int High)
{
int i;
max_subarray *Result = (max_subarray *)calloc(1, sizeof(struct MAX_SUB_ARRAY));
int sum = 0, Left_Sum = INT_MIN, Right_Sum = INT_MIN;

for (i = Mid; i >= Low; i--)
{
sum += A[i];
if (sum > Left_Sum)
{
Left_Sum = sum;
Result->Low = i;
}
}

sum = 0;
for (i = Mid + 1; i <= High; i++)
{
sum += A[i];
if (sum > Right_Sum)
{
Right_Sum = sum;
Result->High = i;
}
}

Result->Sum = Left_Sum + Right_Sum;

return Result;
}

/* 分治算法解决最大子数组 */
max_subarray *Find_Max_Subarray(int *A, int Low, int High)
{
max_subarray *Result = (max_subarray *)calloc(1, sizeof(struct MAX_SUB_ARRAY));
max_subarray *LowResult, *HighResult, *MidResult;
int Mid;

/* 单元素数组或者空数组 */
if (Low == High || (Low < 0 && High < 0))
{
Result->Low = Low;
Result->High = High;
Result->Sum = A[Low];
return Result;
}
else
{
Mid = (Low + High) / 2;
LowResult = Find_Max_Subarray(A, Low, Mid);
MidResult = Find_Max_Crossing_Subarray(A, Low, Mid, High);
HighResult = Find_Max_Subarray(A, Mid + 1, High);
if(LowResult->Sum >= MidResult->Sum && LowResult->Sum >= HighResult->Sum) return LowResult;
else if(MidResult->Sum >= LowResult->Sum && MidResult->Sum >= HighResult->Sum) return MidResult;
else return HighResult;
}
}

/* 遍历算法解决最大子数组问题 */
max_subarray *Traverse_Solution(int *A, int Low, int High)
{
int i, j;
int sum = 0, sum_Last = 0;;
max_subarray *Result = (max_subarray *)calloc(1, sizeof(struct MAX_SUB_ARRAY));

Result->Sum = INT_MIN;

/* 空数组,返回0 */
if (Low < 0 && High < 0)
{
Result->Low = -1;
Result->High = -1;
Result->Sum = 0;
return Result;
}

for (i = Low; i <= High; i++)
{
for (j = i; j <= High; j++)
{
sum = sum_Last + A[j];
sum_Last = sum;
if (sum > Result->Sum)
{
Result->Sum = sum;
Result->Low = i;
Result->High = j;
}
}
sum_Last = 0;
}
return Result;
}


/* 毛毛虫算法(线性时间算法) */
max_subarray *Caterpiller_Linear_Solution(int *A, int Low, int High)
{
max_subarray *Result = (max_subarray *)calloc(1, sizeof(struct MAX_SUB_ARRAY));

Result->Low = Low;
Result->High = Low;
Result->Sum = A[Low];

int i, wall = A[Low], tmp_Low = Low;

for (i = Low + 1; i <= High; i++)
{
if (wall > 0)
{
wall += A[i];
}
else
{
wall = A[i];
tmp_Low = i;
}
if (Result->Sum < wall)
{
Result->Sum = wall;
Result->High = i;
Result->Low = tmp_Low;
}
}
return Result;
}

int main()
{
/* 随机生成指定数量的数组 */
int Seed = 2020;
int Len = 500;
int DoubleTop = 500;
int *Test = (int *)calloc(Len, sizeof(int));
int i;
printf("Test:");
for (i = 0; i < Len; i++)
{
Test[i] = rand() % DoubleTop - DoubleTop / 2;
printf("%d ", Test[i]);
}
putchar('\n');

/* 开始测试 */
SYSTEMTIME st, ed;
long long interval;
max_subarray *Result = (max_subarray *)calloc(1, sizeof(struct MAX_SUB_ARRAY));
/* 分治解法,O(n lg n) */
GetSystemTime(&st);
Result = Find_Max_Subarray(Test, 0, Len - 1);
GetSystemTime(&ed);
interval = (ed.wMilliseconds - st.wMilliseconds) + (ed.wSecond - st.wSecond) * 1000;
printf("Max Subarray (div):\n");
printf("\tLow: %d\n\tHigh: %d\n\tSum: %lf\n\tInterval: %lld\n",
Result->Low, Result->High, Result->Sum, interval);
/* 遍历解法,O(n^2) */
GetSystemTime(&st);
Result = Traverse_Solution(Test, 0, Len - 1);
GetSystemTime(&ed);
interval = (ed.wMilliseconds - st.wMilliseconds) + (ed.wSecond - st.wSecond) * 1000;
printf("Max Subarray (tra):\n");
printf("\tLow: %d\n\tHigh: %d\n\tSum: %lf\n\tInterval: %lld\n",
Result->Low, Result->High, Result->Sum, interval);
/* 毛毛虫,线性时间解法,O(n) */
GetSystemTime(&st);
Result = Caterpiller_Linear_Solution(Test, 0, Len - 1);
GetSystemTime(&ed);
interval = (ed.wMilliseconds - st.wMilliseconds) + (ed.wSecond - st.wSecond) * 1000;
printf("Max Subarray (Cal):\n");
printf("\tLow: %d\n\tHigh: %d\n\tSum: %lf\n\tInterval: %lld\n",
Result->Low, Result->High, Result->Sum, interval);
return 0;
}

模拟程序

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
"""
:FileName: comp_simulation.py
:Date: 07-08-2020
:Version: 1.0.0
:Description: 最大子数组的遍历解法。假设子数组之和的分布在一定范围是均匀分布,模拟探究前项小于后项的次数。
"""
__author__ = "QCF"

import random

# 随机数种子
SEED = 2020
# 模拟次数
TIMES = 50000
# 子数组之和的个数
N = 200
# 子数组之和的上下界
Low = -10000
High = 10000
# 设置种子
random.seed(SEED)

# 开始模拟
result = [0 for i in range(TIMES)]
for t in range(TIMES):
# 生成N个随机数的列表
random_gen = [random.randint(Low, High) for i in range(N)]
# 后项更大的情况的次数
SUM = 0
for i in range(2, len(random_gen)):
if random_gen[i] > random_gen[i - 1]:
SUM += 1
result[t] = SUM
print("Average Prop: ", sum(result) / (N * len(result)))