xref: /haiku/src/add-ons/kernel/network/protocols/tcp/TCPEndpoint.cpp (revision 746cac055adc6ac3308c7bc2d29040fb95689cc9)
1 /*
2  * Copyright 2006-2008, Haiku, Inc. All Rights Reserved.
3  * Distributed under the terms of the MIT License.
4  *
5  * Authors:
6  *		Andrew Galante, haiku.galante@gmail.com
7  *		Axel Dörfler, axeld@pinc-software.de
8  *		Hugo Santos, hugosantos@gmail.com
9  */
10 
11 
12 #include "TCPEndpoint.h"
13 
14 #include <netinet/in.h>
15 #include <netinet/ip.h>
16 #include <netinet/tcp.h>
17 #include <new>
18 #include <signal.h>
19 #include <stdlib.h>
20 #include <string.h>
21 
22 #include <KernelExport.h>
23 #include <Select.h>
24 
25 #include <net_buffer.h>
26 #include <net_datalink.h>
27 #include <net_stat.h>
28 #include <NetBufferUtilities.h>
29 #include <NetUtilities.h>
30 
31 #include <lock.h>
32 #include <tracing.h>
33 #include <util/AutoLock.h>
34 #include <util/khash.h>
35 #include <util/list.h>
36 
37 #include "EndpointManager.h"
38 
39 
40 // References:
41 //  - RFC 793 - Transmission Control Protocol
42 //  - RFC 813 - Window and Acknowledgement Strategy in TCP
43 //	- RFC 1337 - TIME_WAIT Assassination Hazards in TCP
44 //
45 // Things this implementation currently doesn't implement:
46 //	- TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery,
47 //	  RFC 2001, RFC 2581, RFC 3042
48 //	- NewReno Modification to TCP's Fast Recovery, RFC 2582
49 //	- Explicit Congestion Notification (ECN), RFC 3168
50 //	- SYN-Cache
51 //	- TCP Extensions for High Performance, RFC 1323
52 //	- SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517
53 //	- Forward RTO-Recovery, RFC 4138
54 //	- Time-Wait hash instead of keeping sockets alive
55 
// Formats \a address as a C string via AddressString. The expansion calls
// Domain() unqualified, so the macro can only be used where Domain() is in
// scope.
#define PrintAddress(address) \
	AddressString(Domain(), address, true).Data()
58 
59 //#define TRACE_TCP
60 //#define PROBE_TCP
61 
#ifdef TRACE_TCP
// Debug output: every message is prefixed with the calling thread, the
// current system_time(), the "this" pointer and the name of fState, so the
// macro is only usable where "this" and fState are available.
// the space before ', ##args' is important in order for this to work with cpp 2.95
#	define TRACE(format, args...)	dprintf("%ld: TCP [%llu] %p (%12s) " \
		format "\n", find_thread(NULL), system_time(), this, \
		name_for_state(fState) , ##args)
#else
// Tracing disabled: TRACE() expands to an empty statement and its arguments
// are not evaluated.
#	define TRACE(args...)			do { } while (0)
#endif
70 
#ifdef PROBE_TCP
// Dumps one line of send-side state (sequence numbers, congestion window,
// slow start threshold, windows, send queue usage and retransmit timeout)
// for \a buffer and the given \a window. Reads TCPEndpoint members directly,
// so it is only usable inside TCPEndpoint methods.
#	define PROBE(buffer, window) \
	dprintf("TCP PROBE %llu %s %s %ld snxt %lu suna %lu cw %lu sst %lu win %lu swin %lu smax-suna %lu savail %lu sqused %lu rto %llu\n", \
		system_time(), PrintAddress(buffer->source), \
		PrintAddress(buffer->destination), buffer->size, (uint32)fSendNext, \
		(uint32)fSendUnacknowledged, fCongestionWindow, fSlowStartThreshold, \
		window, fSendWindow, (uint32)(fSendMax - fSendUnacknowledged), \
		fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout)
#else
// Probing disabled: PROBE() expands to an empty statement.
#	define PROBE(buffer, window)	do { } while (0)
#endif
82 
83 #if TCP_TRACING
84 namespace TCPTracing {
85 
/*!	Trace entry for a received segment. The constructor copies the endpoint
	pointer and its current state, the buffer pointer and size, the given
	window, and the segment's sequence, acknowledge and flags values.
*/
class Receive : public AbstractTraceEntry {
public:
	Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window,
			net_buffer* buffer)
		:
		fEndpoint(endpoint),
		fBuffer(buffer),
		fBufferSize(buffer->size),
		fSequence(segment.sequence),
		fAcknowledge(segment.acknowledge),
		fWindow(window),
		fState(endpoint->State()),
		fFlags(segment.flags)
	{
		Initialized();
	}

	// Prints the values captured at construction time.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) receive buffer %p (%lu bytes), flags %x, "
			"seq %lu, ack %lu, wnd %lu", fEndpoint, name_for_state(fState),
			fBuffer, fBufferSize, fFlags, fSequence, fAcknowledge, fWindow);
	}

protected:
	TCPEndpoint*	fEndpoint;
	net_buffer*		fBuffer;
	uint32			fBufferSize;
	uint32			fSequence;
	uint32			fAcknowledge;
	uint32			fWindow;
	tcp_state		fState;
	uint8			fFlags;
};
120 
/*!	Trace entry for a sent segment. The constructor copies the endpoint
	pointer and its current state, the buffer pointer and size, the segment's
	sequence, acknowledge and flags values, and the first/last sequence
	numbers passed in by the caller.
*/
class Send : public AbstractTraceEntry {
public:
	Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer,
			uint32 firstSequence, uint32 lastSequence)
		:
		fEndpoint(endpoint),
		fBuffer(buffer),
		fBufferSize(buffer->size),
		fSequence(segment.sequence),
		fAcknowledge(segment.acknowledge),
		fFirstSequence(firstSequence),
		fLastSequence(lastSequence),
		fState(endpoint->State()),
		fFlags(segment.flags)
	{
		Initialized();
	}

	// Prints the values captured at construction time.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) send buffer %p (%lu bytes), flags %x, "
			"seq %lu, ack %lu, first %lu, last %lu",
			fEndpoint, name_for_state(fState), fBuffer, fBufferSize, fFlags,
			fSequence, fAcknowledge, fFirstSequence, fLastSequence);
	}

protected:
	TCPEndpoint*	fEndpoint;
	net_buffer*		fBuffer;
	uint32			fBufferSize;
	uint32			fSequence;
	uint32			fAcknowledge;
	uint32			fFirstSequence;
	uint32			fLastSequence;
	tcp_state		fState;
	uint8			fFlags;
};
158 
/*!	Trace entry for a state change; records the endpoint and the state it
	reports at construction time, i.e. it has to be created after fState was
	updated.
*/
class State : public AbstractTraceEntry {
public:
	State(TCPEndpoint* endpoint)
		:
		fEndpoint(endpoint),
		fState(endpoint->State())
	{
		Initialized();
	}

	// Prints the endpoint and the name of the recorded state.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) state change", fEndpoint,
			name_for_state(fState));
	}

protected:
	TCPEndpoint*	fEndpoint;
	tcp_state		fState;
};
179 
/*!	Trace entry for a timer event. \a which names the timer; only the pointer
	is stored, the string is not copied.
*/
class Timer : public AbstractTraceEntry {
public:
	Timer(TCPEndpoint* endpoint, const char* which)
		:
		fEndpoint(endpoint),
		fWhich(which),
		fState(endpoint->State())
	{
		Initialized();
	}

	// Prints the endpoint, the recorded state name and the timer name.
	virtual void AddDump(TraceOutput& out)
	{
		out.Print("tcp:%p (%12s) %s timer", fEndpoint,
			name_for_state(fState), fWhich);
	}

protected:
	TCPEndpoint*	fEndpoint;
	const char*		fWhich;
	tcp_state		fState;
};
202 
203 }	// namespace TCPTracing
204 
// T(Entry(args)) allocates a TCPTracing::Entry trace entry with
// new(std::nothrow).
#	define T(x)	new(std::nothrow) TCPTracing::x
#else
// Tracing disabled: T() expands to nothing, its argument is not evaluated.
#	define T(x)
208 #endif	// TCP_TRACING
209 
// Initial estimate for packet round trip time (RTT)
// Used unscaled as the initial fRetransmitTimeout and divided by
// kTimestampFactor for the initial round trip time/deviation.
// NOTE(review): presumably in microseconds, like system_time() — confirm.
#define TCP_INITIAL_RTT		2000000

// constants for the fFlags field
enum {
	// Both option flags are set by the constructor and updated in
	// _PrepareReceivePath() depending on the options of the peer's segment.
	FLAG_OPTION_WINDOW_SCALE	= 0x01,
	FLAG_OPTION_TIMESTAMP		= 0x02,
	// TODO: Should FLAG_NO_RECEIVE apply as well to received connections?
	//       That is, what is expected from accept() after a shutdown()
	//       is performed on a listen()ing socket.
	// Set by Shutdown() for SHUT_RD and SHUT_RDWR.
	FLAG_NO_RECEIVE				= 0x04,
	// Set by Free().
	FLAG_CLOSED					= 0x08,
	// Set by _Close() and, for local connections in TIME_WAIT, by
	// _EnterTimeWait(); makes Free() return B_OK instead of B_BUSY.
	FLAG_DELETE_ON_CLOSE		= 0x10,
	// Reported by IsLocal().
	FLAG_LOCAL					= 0x20
};


// Divisor that converts system_time() values into the units returned by
// tcp_now().
static const int kTimestampFactor = 1024;
228 
229 
230 static inline bigtime_t
231 absolute_timeout(bigtime_t timeout)
232 {
233 	if (timeout == 0 || timeout == B_INFINITE_TIMEOUT)
234 		return timeout;
235 
236 	return timeout + system_time();
237 }
238 
239 
240 static inline status_t
241 posix_error(status_t error)
242 {
243 	if (error == B_TIMED_OUT)
244 		return B_WOULD_BLOCK;
245 
246 	return error;
247 }
248 
249 
250 static inline bool
251 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext,
252 	uint32 receiveWindow)
253 {
254 	return sequence >= receiveNext && sequence < (receiveNext + receiveWindow);
255 }
256 
257 
258 static inline bool
259 segment_in_sequence(const tcp_segment_header& segment, int size,
260 	const tcp_sequence& receiveNext, uint32 receiveWindow)
261 {
262 	tcp_sequence sequence(segment.sequence);
263 	if (size == 0) {
264 		if (receiveWindow == 0)
265 			return sequence == receiveNext;
266 		return in_window(sequence, receiveNext, receiveWindow);
267 	} else {
268 		if (receiveWindow == 0)
269 			return false;
270 		return in_window(sequence, receiveNext, receiveWindow)
271 			|| in_window(sequence + size - 1, receiveNext, receiveWindow);
272 	}
273 }
274 
275 
276 static inline bool
277 is_writable(tcp_state state)
278 {
279 	return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED
280 		|| state == ESTABLISHED || state == FINISH_RECEIVED;
281 }
282 
283 
284 static inline uint32 tcp_now()
285 {
286 	return system_time() / kTimestampFactor;
287 }
288 
289 
290 static inline uint32 tcp_diff_timestamp(uint32 base)
291 {
292 	uint32 now = tcp_now();
293 
294 	if (now > base)
295 		return now - base;
296 
297 	return now + UINT_MAX - base;
298 }
299 
300 
301 static inline bool
302 state_needs_finish(int32 state)
303 {
304 	return state == WAIT_FOR_FINISH_ACKNOWLEDGE
305 		|| state == FINISH_SENT || state == CLOSING;
306 }
307 
308 
309 //	#pragma mark -
310 
311 
312 WaitList::WaitList(const char* name)
313 {
314 	fCondition = 0;
315 	fSem = create_sem(0, name);
316 }
317 
318 
/*!	Deletes the semaphore created by the constructor. */
WaitList::~WaitList()
{
	delete_sem(fSem);
}
323 
324 
325 status_t
326 WaitList::InitCheck() const
327 {
328 	return fSem;
329 }
330 
331 
/*!	Releases \a locker and blocks until the condition flag has been set (see
	Signal()) or acquiring the semaphore fails, then re-acquires \a locker.
	\param locker The lock to drop while waiting; it is locked again before
		the method returns, whatever the outcome.
	\param timeout Handed to acquire_sem_etc() with B_ABSOLUTE_TIMEOUT, i.e.
		it is interpreted as an absolute time. The wait can be interrupted
		(B_CAN_INTERRUPT).
	\return B_OK when the condition flag was found set, otherwise the error
		returned by acquire_sem_etc().
*/
status_t
WaitList::Wait(MutexLocker& locker, bigtime_t timeout)
{
	locker.Unlock();

	status_t status = B_OK;

	// NOTE(review): presumably atomic_test_and_set() returns the previous
	// value and resets the flag to 0 if it was 1, so the loop ends as soon
	// as a signal was consumed — confirm against the kernel API.
	while (!atomic_test_and_set(&fCondition, 0, 1)) {
		status = acquire_sem_etc(fSem, 1, B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT,
			timeout);
		if (status != B_OK)
			break;
	}

	locker.Lock();
	return status;
}
349 
350 
/*!	Sets the condition flag and releases the semaphore so that threads
	blocked in Wait() re-check the flag. No reschedule is requested
	(B_DO_NOT_RESCHEDULE).
*/
void
WaitList::Signal()
{
	atomic_or(&fCondition, 1);
#ifdef __HAIKU__
	release_sem_etc(fSem, 1, B_DO_NOT_RESCHEDULE | B_RELEASE_ALL);
#else
	// Without B_RELEASE_ALL: release the semaphore once for every waiter,
	// which is the negated count when the count is negative.
	int32 count;
	if (get_sem_count(fSem, &count) == B_OK && count < 0)
		release_sem_etc(fSem, -count, B_DO_NOT_RESCHEDULE);
#endif
}
363 
364 
365 //	#pragma mark -
366 
367 
368 TCPEndpoint::TCPEndpoint(net_socket* socket)
369 	: ProtocolSocket(socket),
370 	fManager(NULL),
371 	fReceiveList("tcp receive"),
372 	fSendList("tcp send"),
373 	fOptions(0),
374 	fSendWindowShift(0),
375 	fReceiveWindowShift(0),
376 	fSendUnacknowledged(0),
377 	fSendNext(0),
378 	fSendMax(0),
379 	fSendUrgentOffset(0),
380 	fSendWindow(0),
381 	fSendMaxWindow(0),
382 	fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
383 	fSendQueue(socket->send.buffer_size),
384 	fInitialSendSequence(0),
385 	fDuplicateAcknowledgeCount(0),
386 	fRoute(NULL),
387 	fReceiveNext(0),
388 	fReceiveMaxAdvertised(0),
389 	fReceiveWindow(socket->receive.buffer_size),
390 	fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE),
391 	fReceiveQueue(socket->receive.buffer_size),
392 	fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor),
393 	fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor),
394 	fRetransmitTimeout(TCP_INITIAL_RTT),
395 	fReceivedTimestamp(0),
396 	fCongestionWindow(0),
397 	fSlowStartThreshold(0),
398 	fState(CLOSED),
399 	fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP)
400 {
401 	// TODO: to be replaced with a real read/write locking strategy!
402 	mutex_init(&fLock, "tcp lock");
403 
404 	gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this);
405 	gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer,
406 		this);
407 	gStackModule->init_timer(&fDelayedAcknowledgeTimer,
408 		TCPEndpoint::_DelayedAcknowledgeTimer, this);
409 	gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer,
410 		this);
411 }
412 
413 
/*!	Tears the endpoint down: cancels all four timers, unbinds from and
	releases the endpoint manager (if Open() acquired one), destroys the lock
	and finally waits for the timer hooks to return.
*/
TCPEndpoint::~TCPEndpoint()
{
	mutex_lock(&fLock);

	_CancelConnectionTimers();
	gStackModule->cancel_timer(&fTimeWaitTimer);

	if (fManager != NULL) {
		fManager->Unbind(this);
		put_endpoint_manager(fManager);
	}

	// NOTE(review): the lock is destroyed while still held, there is no
	// explicit unlock — presumably mutex_destroy() is meant to be called
	// this way; confirm against the kernel lock API.
	mutex_destroy(&fLock);

	// we need to wait for all timers to return
	gStackModule->wait_for_timer(&fRetransmitTimer);
	gStackModule->wait_for_timer(&fPersistTimer);
	gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer);
	gStackModule->wait_for_timer(&fTimeWaitTimer);
}
434 
435 
436 status_t
437 TCPEndpoint::InitCheck() const
438 {
439 	if (fReceiveList.InitCheck() < B_OK)
440 		return fReceiveList.InitCheck();
441 
442 	if (fSendList.InitCheck() < B_OK)
443 		return fSendList.InitCheck();
444 
445 	return B_OK;
446 }
447 
448 
449 //	#pragma mark - protocol API
450 
451 
452 status_t
453 TCPEndpoint::Open()
454 {
455 	TRACE("Open()");
456 
457 	status_t status = ProtocolSocket::Open();
458 	if (status < B_OK)
459 		return status;
460 
461 	fManager = get_endpoint_manager(Domain());
462 	if (fManager == NULL)
463 		return EAFNOSUPPORT;
464 
465 	return B_OK;
466 }
467 
468 
469 status_t
470 TCPEndpoint::Close()
471 {
472 	TRACE("Close()");
473 
474 	MutexLocker locker(fLock);
475 
476 	if (fState == LISTEN)
477 		delete_sem(fAcceptSemaphore);
478 
479 	if (fState == SYNCHRONIZE_SENT || fState == LISTEN) {
480 		// TODO: what about linger in case of SYNCHRONIZE_SENT?
481 		fState = CLOSED;
482 		T(State(this));
483 		return B_OK;
484 	}
485 
486 	status_t status = _Disconnect(true);
487 	if (status != B_OK)
488 		return status;
489 
490 	if (socket->options & SO_LINGER) {
491 		TRACE("Close(): Lingering for %i secs", socket->linger);
492 
493 		bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL);
494 
495 		while (fSendQueue.Used() > 0) {
496 			status = fSendList.Wait(locker, maximum);
497 			if (status == B_TIMED_OUT || status == B_WOULD_BLOCK)
498 				break;
499 			else if (status < B_OK)
500 				return status;
501 		}
502 
503 		TRACE("Close(): after waiting, the SendQ was left with %lu bytes.",
504 			fSendQueue.Used());
505 	}
506 	return B_OK;
507 }
508 
509 
510 status_t
511 TCPEndpoint::Free()
512 {
513 	TRACE("Free()");
514 
515 	MutexLocker _(fLock);
516 
517 	if (fState <= SYNCHRONIZE_SENT)
518 		return B_OK;
519 
520 	// we are only interested in the timer, not in changing state
521 	_EnterTimeWait();
522 
523 	fFlags |= FLAG_CLOSED;
524 	if ((fFlags & FLAG_DELETE_ON_CLOSE) != 0)
525 		return B_OK;
526 
527 	return B_BUSY;
528 		// we'll be freed later when the 2MSL timer expires
529 }
530 
531 
532 /*!	Creates and sends a synchronize packet to /a address, and then waits
533 	until the connection has been established or refused.
534 */
535 status_t
536 TCPEndpoint::Connect(const sockaddr* address)
537 {
538 	TRACE("Connect() on address %s", PrintAddress(address));
539 
540 	MutexLocker locker(fLock);
541 
542 	if (gStackModule->is_restarted_syscall()) {
543 		bigtime_t timeout = gStackModule->restore_syscall_restart_timeout();
544 		status_t status = _WaitForEstablished(locker, timeout);
545 		TRACE("  Connect(): Connection complete: %s (timeout was %llu)",
546 			strerror(status), timeout);
547 		return posix_error(status);
548 	}
549 
550 	// Can only call connect() from CLOSED or LISTEN states
551 	// otherwise endpoint is considered already connected
552 	if (fState == LISTEN) {
553 		// this socket is about to connect; remove pending connections in the backlog
554 		gSocketModule->set_max_backlog(socket, 0);
555 	} else if (fState == ESTABLISHED) {
556 		return EISCONN;
557 	} else if (fState != CLOSED)
558 		return EINPROGRESS;
559 
560 	// TODO: this is IPv4 specific, and doesn't belong here!
561 	// consider destination address INADDR_ANY as INADDR_LOOPBACK
562 	sockaddr_in _address;
563 	if (((sockaddr_in*)address)->sin_addr.s_addr == INADDR_ANY) {
564 		memcpy(&_address, address, sizeof(sockaddr_in));
565 		_address.sin_len = sizeof(sockaddr_in);
566 		_address.sin_addr.s_addr = INADDR_LOOPBACK;
567 		address = (sockaddr*)&_address;
568 	}
569 
570 	status_t status = _PrepareSendPath(address);
571 	if (status < B_OK)
572 		return status;
573 
574 	TRACE("  Connect(): starting 3-way handshake...");
575 
576 	fState = SYNCHRONIZE_SENT;
577 	T(State(this));
578 
579 	// send SYN
580 	status = _SendQueued();
581 	if (status != B_OK) {
582 		_Close();
583 		return status;
584 	}
585 
586 	// If we are running over Loopback, after _SendQueued() returns we
587 	// may be in ESTABLISHED already.
588 	if (fState == ESTABLISHED) {
589 		TRACE("  Connect() completed after _SendQueued()");
590 		return B_OK;
591 	}
592 
593 	// wait until 3-way handshake is complete (if needed)
594 	bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT);
595 	if (timeout == 0) {
596 		// we're a non-blocking socket
597 		TRACE("  Connect() delayed, return EINPROGRESS");
598 		return EINPROGRESS;
599 	}
600 
601 	bigtime_t absoluteTimeout = absolute_timeout(timeout);
602 	gStackModule->store_syscall_restart_timeout(absoluteTimeout);
603 
604 	status = _WaitForEstablished(locker, absoluteTimeout);
605 	TRACE("  Connect(): Connection complete: %s (timeout was %llu)",
606 		strerror(status), timeout);
607 	return posix_error(status);
608 }
609 
610 
611 status_t
612 TCPEndpoint::Accept(struct net_socket** _acceptedSocket)
613 {
614 	TRACE("Accept()");
615 
616 	MutexLocker locker(fLock);
617 
618 	status_t status;
619 	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
620 	if (gStackModule->is_restarted_syscall())
621 		timeout = gStackModule->restore_syscall_restart_timeout();
622 	else
623 		gStackModule->store_syscall_restart_timeout(timeout);
624 
625 	do {
626 		locker.Unlock();
627 
628 		status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT
629 			| B_CAN_INTERRUPT, timeout);
630 		if (status < B_OK)
631 			return status;
632 
633 		locker.Lock();
634 		status = gSocketModule->dequeue_connected(socket, _acceptedSocket);
635 #ifdef TRACE_TCP
636 		if (status == B_OK)
637 			TRACE("  Accept() returning %p", (*_acceptedSocket)->first_protocol);
638 #endif
639 	} while (status < B_OK);
640 
641 	return status;
642 }
643 
644 
645 status_t
646 TCPEndpoint::Bind(const sockaddr *address)
647 {
648 	if (address == NULL)
649 		return B_BAD_VALUE;
650 
651 	MutexLocker lock(fLock);
652 
653 	TRACE("Bind() on address %s", PrintAddress(address));
654 
655 	if (fState != CLOSED)
656 		return EISCONN;
657 
658 	return fManager->Bind(this, address);
659 }
660 
661 
/*!	Removes the endpoint's binding through the endpoint manager while holding
	the endpoint lock. The \a address argument is not used.
	\return The result of the manager's Unbind().
*/
status_t
TCPEndpoint::Unbind(struct sockaddr *address)
{
	TRACE("Unbind()");

	MutexLocker lock(fLock);
	return fManager->Unbind(this);
}
670 
671 
672 status_t
673 TCPEndpoint::Listen(int count)
674 {
675 	TRACE("Listen()");
676 
677 	MutexLocker lock(fLock);
678 
679 	if (fState != CLOSED)
680 		return B_BAD_VALUE;
681 
682 	fAcceptSemaphore = create_sem(0, "tcp accept");
683 	if (fAcceptSemaphore < B_OK)
684 		return ENOBUFS;
685 
686 	status_t status = fManager->SetPassive(this);
687 	if (status < B_OK) {
688 		delete_sem(fAcceptSemaphore);
689 		fAcceptSemaphore = -1;
690 		return status;
691 	}
692 
693 	fState = LISTEN;
694 	T(State(this));
695 	return B_OK;
696 }
697 
698 
699 status_t
700 TCPEndpoint::Shutdown(int direction)
701 {
702 	TRACE("Shutdown(%i)", direction);
703 
704 	MutexLocker lock(fLock);
705 
706 	if (direction == SHUT_RD || direction == SHUT_RDWR)
707 		fFlags |= FLAG_NO_RECEIVE;
708 
709 	if (direction == SHUT_WR || direction == SHUT_RDWR) {
710 		// TODO: That's not correct. After read/write shutting down the socket
711 		// one should still be able to read previously arrived data.
712 		_Disconnect(false);
713 	}
714 
715 	return B_OK;
716 }
717 
718 
719 /*!	Puts data contained in \a buffer into send buffer */
720 status_t
721 TCPEndpoint::SendData(net_buffer *buffer)
722 {
723 	MutexLocker lock(fLock);
724 
725 	TRACE("SendData(buffer %p, size %lu, flags %lx) [total %lu bytes, has %lu]",
726 		  buffer, buffer->size, buffer->flags, fSendQueue.Size(),
727 		  fSendQueue.Free());
728 
729 	if (fState == CLOSED)
730 		return ENOTCONN;
731 	if (fState == LISTEN)
732 		return EDESTADDRREQ;
733 	if (!is_writable(fState)) {
734 		// we only send signals when called from userland
735 		if (gStackModule->is_syscall)
736 			send_signal(find_thread(NULL), SIGPIPE);
737 		return EPIPE;
738 	}
739 
740 	uint32 flags = buffer->flags;
741 	size_t left = buffer->size;
742 
743 	bigtime_t timeout = absolute_timeout(socket->send.timeout);
744 	if (gStackModule->is_restarted_syscall())
745 		timeout = gStackModule->restore_syscall_restart_timeout();
746 	else
747 		gStackModule->store_syscall_restart_timeout(timeout);
748 
749 	while (left > 0) {
750 		while (fSendQueue.Free() < socket->send.low_water_mark) {
751 			// wait until enough space is available
752 			status_t status = fSendList.Wait(lock, timeout);
753 			if (status < B_OK) {
754 				TRACE("  SendData() returning %s (%d)",
755 					strerror(posix_error(status)), (int)posix_error(status));
756 				return posix_error(status);
757 			}
758 
759 			if (!is_writable(fState)) {
760 				// we only send signals when called from userland
761 				if (gStackModule->is_syscall)
762 					send_signal(find_thread(NULL), SIGPIPE);
763 				return EPIPE;
764 			}
765 		}
766 
767 		size_t size = fSendQueue.Free();
768 		if (size < left) {
769 			// we need to split the original buffer
770 			net_buffer* clone = gBufferModule->clone(buffer, false);
771 				// TODO: add offset/size parameter to net_buffer::clone() or
772 				// even a move_data() function, as this is a bit inefficient
773 			if (clone == NULL)
774 				return ENOBUFS;
775 
776 			status_t status = gBufferModule->trim(clone, size);
777 			if (status != B_OK) {
778 				gBufferModule->free(clone);
779 				return status;
780 			}
781 
782 			gBufferModule->remove_header(buffer, size);
783 			left -= size;
784 			fSendQueue.Add(clone);
785 		} else {
786 			left -= buffer->size;
787 			fSendQueue.Add(buffer);
788 		}
789 	}
790 
791 	TRACE("  SendData(): %lu bytes used.", fSendQueue.Used());
792 
793 	bool force = false;
794 	if ((flags & MSG_OOB) != 0) {
795 		fSendUrgentOffset = fSendQueue.LastSequence();
796 			// RFC 961 specifies that the urgent offset points to the last
797 			// byte of urgent data. However, this is commonly implemented as
798 			// here, ie. it points to the first byte after the urgent data.
799 		force = true;
800 	}
801 	if ((flags & MSG_EOF) != 0)
802 		_Disconnect(false);
803 
804 	if (fState == ESTABLISHED || fState == FINISH_RECEIVED)
805 		_SendQueued(force);
806 
807 	return B_OK;
808 }
809 
810 
811 ssize_t
812 TCPEndpoint::SendAvailable()
813 {
814 	MutexLocker locker(fLock);
815 
816 	ssize_t available;
817 
818 	if (is_writable(fState))
819 		available = fSendQueue.Free();
820 	else
821 		available = EPIPE;
822 
823 	TRACE("SendAvailable(): %li", available);
824 	return available;
825 }
826 
827 
828 status_t
829 TCPEndpoint::FillStat(net_stat *stat)
830 {
831 	MutexLocker _(fLock);
832 
833 	strlcpy(stat->state, name_for_state(fState), sizeof(stat->state));
834 	stat->receive_queue_size = fReceiveQueue.Available();
835 	stat->send_queue_size = fSendQueue.Used();
836 
837 	return B_OK;
838 }
839 
840 
/*!	Retrieves up to \a numBytes of received data and stores the resulting
	buffer in \a _buffer, waiting for data to arrive if necessary.

	If the connection is still being set up, the method first waits for it
	to be established (unless MSG_DONTWAIT is given). It then waits until
	at least receive.low_water_mark bytes -- or \a numBytes with
	MSG_WAITALL -- are available, or all available data has been pushed.
	With MSG_PEEK the data is left in the receive queue.

	\param numBytes Maximum number of bytes to retrieve.
	\param flags MSG_DONTWAIT, MSG_WAITALL and MSG_PEEK are evaluated.
	\param _buffer Set to NULL on entry; receives the data on success.
	\return The result of the receive queue's Get() when data is handed out.
		B_OK with \a _buffer left NULL when the connection is closing or
		reading was shut down, ENOTCONN if the endpoint is CLOSED,
		B_WOULD_BLOCK if the call would have to wait but may not, or the
		error of the wait (with B_TIMED_OUT mapped to B_WOULD_BLOCK).
*/
status_t
TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer)
{
	TRACE("ReadData(%lu bytes, flags 0x%x)", numBytes, (unsigned int)flags);

	MutexLocker locker(fLock);

	*_buffer = NULL;

	if (fState == CLOSED)
		return ENOTCONN;

	// a restarted syscall continues with the timeout stored the first time
	bigtime_t timeout = absolute_timeout(socket->receive.timeout);
	if (gStackModule->is_restarted_syscall())
		timeout = gStackModule->restore_syscall_restart_timeout();
	else
		gStackModule->store_syscall_restart_timeout(timeout);

	if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) {
		if (flags & MSG_DONTWAIT)
			return B_WOULD_BLOCK;

		status_t status = _WaitForEstablished(locker, timeout);
		if (status < B_OK)
			return posix_error(status);
	}

	size_t dataNeeded = socket->receive.low_water_mark;

	// When MSG_WAITALL is set then the function should block
	// until the full amount of data can be returned.
	if (flags & MSG_WAITALL)
		dataNeeded = numBytes;

	// TODO: add support for urgent data (MSG_OOB)

	while (true) {
		if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE
			|| fState == TIME_WAIT) {
			// ``Connection closing''.
			return B_OK;
		}

		if (fReceiveQueue.Available() > 0) {
			// enough data, or everything available has been pushed
			if (fReceiveQueue.Available() >= dataNeeded
				|| (fReceiveQueue.PushedData() > 0
					&& fReceiveQueue.PushedData() >= fReceiveQueue.Available()))
				break;
		} else if (fState == FINISH_RECEIVED) {
			// ``If no text is awaiting delivery, the RECEIVE will
			//   get a Connection closing''.
			return B_OK;
		}

		if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0)
			return B_WOULD_BLOCK;

		if ((fFlags & FLAG_NO_RECEIVE) != 0)
			return B_OK;

		// drops the endpoint lock while waiting
		status_t status = fReceiveList.Wait(locker, timeout);
		if (status < B_OK) {
			// The Open Group base specification mentions that EINTR should be
			// returned if the recv() is interrupted before _any data_ is
			// available. So we actually check if there is data, and if so,
			// push it to the user.
			if ((status == B_TIMED_OUT || status == B_INTERRUPTED)
				&& fReceiveQueue.Available() > 0)
				break;

			return posix_error(status);
		}
	}

	TRACE("  ReadData(): %lu are available.", fReceiveQueue.Available());

	// more data is available than requested: signal the receive list again
	if (numBytes < fReceiveQueue.Available())
		fReceiveList.Signal();

	// when peeking, the data is not removed from the queue
	bool clone = (flags & MSG_PEEK) != 0;

	ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer);

	TRACE("  ReadData(): %lu bytes kept.", fReceiveQueue.Available());

	// if we are opening the window, check if we should send an ACK
	if (!clone)
		SendAcknowledge(false);

	return receivedBytes;
}
932 
933 
934 ssize_t
935 TCPEndpoint::ReadAvailable()
936 {
937 	MutexLocker locker(fLock);
938 
939 	TRACE("ReadAvailable(): %li", _AvailableData());
940 
941 	return _AvailableData();
942 }
943 
944 
/*!	Sets the maximum number of bytes of the send queue to \a length, with the
	endpoint lock held.
	\return Always B_OK.
*/
status_t
TCPEndpoint::SetSendBufferSize(size_t length)
{
	MutexLocker _(fLock);
	fSendQueue.SetMaxBytes(length);
	return B_OK;
}
952 
953 
/*!	Sets the maximum number of bytes of the receive queue to \a length, with
	the endpoint lock held.
	\return Always B_OK.
*/
status_t
TCPEndpoint::SetReceiveBufferSize(size_t length)
{
	MutexLocker _(fLock);
	fReceiveQueue.SetMaxBytes(length);
	return B_OK;
}
961 
962 
963 status_t
964 TCPEndpoint::GetOption(int option, void* _value, int* _length)
965 {
966 	if (*_length != sizeof(int))
967 		return B_BAD_VALUE;
968 
969 	int* value = (int*)_value;
970 
971 	switch (option) {
972 		case TCP_NODELAY:
973 			if ((fOptions & TCP_NODELAY) != 0)
974 				*value = 1;
975 			else
976 				*value = 0;
977 			return B_OK;
978 
979 		case TCP_MAXSEG:
980 			*value = fReceiveMaxSegmentSize;
981 			return B_OK;
982 
983 		default:
984 			return B_BAD_VALUE;
985 	}
986 }
987 
988 
989 status_t
990 TCPEndpoint::SetOption(int option, const void* _value, int length)
991 {
992 	if (option != TCP_NODELAY)
993 		return B_BAD_VALUE;
994 
995 	if (length != sizeof(int))
996 		return B_BAD_VALUE;
997 
998 	const int* value = (const int*)_value;
999 
1000 	MutexLocker _(fLock);
1001 	if (*value)
1002 		fOptions |= TCP_NODELAY;
1003 	else
1004 		fOptions &= ~TCP_NODELAY;
1005 
1006 	return B_OK;
1007 }
1008 
1009 
1010 //	#pragma mark - misc
1011 
1012 
/*!	Returns whether the endpoint has a local address, i.e. whether
	LocalAddress().IsEmpty(true) is false.
*/
bool
TCPEndpoint::IsBound() const
{
	return !LocalAddress().IsEmpty(true);
}
1018 
1019 
/*!	Returns whether FLAG_LOCAL is set for this endpoint. */
bool
TCPEndpoint::IsLocal() const
{
	return (fFlags & FLAG_LOCAL) != 0;
}
1025 
1026 
1027 status_t
1028 TCPEndpoint::DelayedAcknowledge()
1029 {
1030 	if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) {
1031 		// timer was active, send an ACK now (with the exception above,
1032 		// we send every other ACK)
1033 		return SendAcknowledge(true);
1034 	}
1035 
1036 	gStackModule->set_timer(&fDelayedAcknowledgeTimer,
1037 		TCP_DELAYED_ACKNOWLEDGE_TIMEOUT);
1038 	return B_OK;
1039 }
1040 
1041 
/*!	Sends an acknowledge by calling _SendQueued() with \a force and a second
	argument of 0.
	\return The result of _SendQueued().
*/
status_t
TCPEndpoint::SendAcknowledge(bool force)
{
	return _SendQueued(force, 0);
}
1047 
1048 
/*!	Arms the persist timer with a fixed delay of 1000000.
	NOTE(review): presumably microseconds, i.e. one second — confirm against
	the stack module's set_timer().
*/
void
TCPEndpoint::_StartPersistTimer()
{
	gStackModule->set_timer(&fPersistTimer, 1000000LL);
}
1054 
1055 
1056 void
1057 TCPEndpoint::_EnterTimeWait()
1058 {
1059 	TRACE("_EnterTimeWait()\n");
1060 
1061 	_CancelConnectionTimers();
1062 
1063 	if (fState == TIME_WAIT && IsLocal()) {
1064 		// we do not use TIME_WAIT state for local connections
1065 		fFlags |= FLAG_DELETE_ON_CLOSE;
1066 		return;
1067 	}
1068 
1069 	_UpdateTimeWait();
1070 }
1071 
1072 
/*!	(Re)arms the time wait timer with twice the maximum segment lifetime
	(TCP_MAX_SEGMENT_LIFETIME << 1).
*/
void
TCPEndpoint::_UpdateTimeWait()
{
	gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1);
}
1078 
1079 
/*!	Cancels the retransmit, persist and delayed acknowledge timers. The time
	wait timer is not touched.
*/
void
TCPEndpoint::_CancelConnectionTimers()
{
	gStackModule->cancel_timer(&fRetransmitTimer);
	gStackModule->cancel_timer(&fPersistTimer);
	gStackModule->cancel_timer(&fDelayedAcknowledgeTimer);
}
1087 
1088 
1089 /*!	Sends the FIN flag to the peer when the connection is still open.
1090 	Moves the endpoint to the next state depending on where it was.
1091 */
1092 status_t
1093 TCPEndpoint::_Disconnect(bool closing)
1094 {
1095 	tcp_state previousState = fState;
1096 
1097 	if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED)
1098 		fState = FINISH_SENT;
1099 	else if (fState == FINISH_RECEIVED)
1100 		fState = WAIT_FOR_FINISH_ACKNOWLEDGE;
1101 	else
1102 		return B_OK;
1103 
1104 	T(State(this));
1105 
1106 	status_t status = _SendQueued();
1107 	if (status != B_OK) {
1108 		fState = previousState;
1109 		T(State(this));
1110 		return status;
1111 	}
1112 
1113 	return B_OK;
1114 }
1115 
1116 
1117 void
1118 TCPEndpoint::_MarkEstablished()
1119 {
1120 	fState = ESTABLISHED;
1121 	T(State(this));
1122 
1123 	if (socket->parent != NULL) {
1124 		gSocketModule->set_connected(socket);
1125 		release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE);
1126 	}
1127 
1128 	fSendList.Signal();
1129 }
1130 
1131 
1132 status_t
1133 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout)
1134 {
1135 // TODO: Checking for CLOSED seems correct, but breaks several neon tests.
1136 // When investigating this, also have a look at _Close() and _HandleReset().
1137 	while (fState < ESTABLISHED/* && fState != CLOSED*/) {
1138 		if (socket->error != B_OK)
1139 			return socket->error;
1140 
1141 		status_t status = fSendList.Wait(locker, timeout);
1142 		if (status < B_OK)
1143 			return status;
1144 	}
1145 
1146 	return B_OK;
1147 }
1148 
1149 
1150 //	#pragma mark - receive
1151 
1152 
/*!	Puts the endpoint into the CLOSED state: cancels the connection timers,
	sets FLAG_DELETE_ON_CLOSE, and wakes up both the threads waiting on the
	send list and the readers.
*/
void
TCPEndpoint::_Close()
{
	_CancelConnectionTimers();
	fState = CLOSED;
	T(State(this));

	fFlags |= FLAG_DELETE_ON_CLOSE;

	// waiters re-evaluate the state once they are woken up
	fSendList.Signal();
	_NotifyReader();
}
1165 
1166 
/*!	Aborts the connection with the given \a error: stores it as the socket's
	error, closes the endpoint via _Close(), and notifies B_SELECT_WRITE and
	B_SELECT_ERROR listeners with that error.
*/
void
TCPEndpoint::_HandleReset(status_t error)
{
	// set the error first, _Close() wakes up the waiters that check it
	socket->error = error;
	_Close();

	gSocketModule->notify(socket, B_SELECT_WRITE, error);
	gSocketModule->notify(socket, B_SELECT_ERROR, error);
}
1176 
1177 
1178 void
1179 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment)
1180 {
1181 	if (++fDuplicateAcknowledgeCount < 3)
1182 		return;
1183 
1184 	if (fDuplicateAcknowledgeCount == 3) {
1185 		_ResetSlowStart();
1186 		fCongestionWindow = fSlowStartThreshold + 3
1187 			* fSendMaxSegmentSize;
1188 		fSendNext = segment.acknowledge;
1189 	} else if (fDuplicateAcknowledgeCount > 3)
1190 		fCongestionWindow += fSendMaxSegmentSize;
1191 
1192 	_SendQueued();
1193 }
1194 
1195 
1196 void
1197 TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment,
1198 	size_t segmentLength)
1199 {
1200 	if (fFlags & FLAG_OPTION_TIMESTAMP) {
1201 		tcp_sequence sequence(segment.sequence);
1202 
1203 		if (fLastAcknowledgeSent >= sequence
1204 			&& fLastAcknowledgeSent < (sequence + segmentLength))
1205 			fReceivedTimestamp = segment.timestamp_value;
1206 	}
1207 }
1208 
1209 
1210 ssize_t
1211 TCPEndpoint::_AvailableData() const
1212 {
1213 	// TODO: Refer to the FLAG_NO_RECEIVE comment above regarding
1214 	//       the application of FLAG_NO_RECEIVE in listen()ing
1215 	//       sockets.
1216 	if (fState == LISTEN)
1217 		return gSocketModule->count_connected(socket);
1218 	if (fState == SYNCHRONIZE_SENT)
1219 		return 0;
1220 
1221 	ssize_t availableData = fReceiveQueue.Available();
1222 
1223 	if (availableData == 0 && !_ShouldReceive())
1224 		return ENOTCONN;
1225 
1226 	return availableData;
1227 }
1228 
1229 
1230 void
1231 TCPEndpoint::_NotifyReader()
1232 {
1233 	fReceiveList.Signal();
1234 	gSocketModule->notify(socket, B_SELECT_READ, _AvailableData());
1235 }
1236 
1237 
1238 bool
1239 TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer)
1240 {
1241 	fReceiveQueue.Add(buffer, segment.sequence);
1242 	fReceiveNext = fReceiveQueue.NextSequence();
1243 
1244 	TRACE("  _AddData(): adding data, receive next = %lu. Now have %lu bytes.",
1245 		(uint32)fReceiveNext, fReceiveQueue.Available());
1246 
1247 	if (segment.flags & TCP_FLAG_PUSH)
1248 		fReceiveQueue.SetPushPointer();
1249 
1250 	return fReceiveQueue.Available() > 0;
1251 }
1252 
1253 
1254 void
1255 TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment)
1256 {
1257 	fInitialReceiveSequence = segment.sequence;
1258 
1259 	// count the received SYN
1260 	segment.sequence++;
1261 
1262 	fReceiveNext = segment.sequence;
1263 	fReceiveQueue.SetInitialSequence(segment.sequence);
1264 
1265 	if ((fOptions & TCP_NOOPT) == 0) {
1266 		if (segment.max_segment_size > 0)
1267 			fSendMaxSegmentSize = segment.max_segment_size;
1268 
1269 		if (segment.options & TCP_HAS_WINDOW_SCALE) {
1270 			fFlags |= FLAG_OPTION_WINDOW_SCALE;
1271 			fSendWindowShift = segment.window_shift;
1272 		} else {
1273 			fFlags &= ~FLAG_OPTION_WINDOW_SCALE;
1274 			fReceiveWindowShift = 0;
1275 		}
1276 
1277 		if (segment.options & TCP_HAS_TIMESTAMPS) {
1278 			fFlags |= FLAG_OPTION_TIMESTAMP;
1279 			fReceivedTimestamp = segment.timestamp_value;
1280 		} else
1281 			fFlags &= ~FLAG_OPTION_TIMESTAMP;
1282 	}
1283 
1284 	fCongestionWindow = 2 * fSendMaxSegmentSize;
1285 	fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift;
1286 }
1287 
1288 
1289 bool
1290 TCPEndpoint::_ShouldReceive() const
1291 {
1292 	if ((fFlags & FLAG_NO_RECEIVE) != 0)
1293 		return false;
1294 
1295 	return fState == ESTABLISHED || fState == FINISH_SENT
1296 		|| fState == FINISH_ACKNOWLEDGED;
1297 }
1298 
1299 
1300 int32
1301 TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment,
1302 	net_buffer* buffer)
1303 {
1304 	MutexLocker _(fLock);
1305 
1306 	// TODO error checking
1307 	ProtocolSocket::Open();
1308 
1309 	fState = SYNCHRONIZE_RECEIVED;
1310 	T(State(this));
1311 	fManager = parent->fManager;
1312 
1313 	LocalAddress().SetTo(buffer->destination);
1314 	PeerAddress().SetTo(buffer->source);
1315 
1316 	TRACE("Spawn()");
1317 
1318 	// TODO: proper error handling!
1319 	if (fManager->BindChild(this) < B_OK)
1320 		return DROP;
1321 
1322 	if (_PrepareSendPath(*PeerAddress()) < B_OK)
1323 		return DROP;
1324 
1325 	fOptions = parent->fOptions;
1326 	fAcceptSemaphore = parent->fAcceptSemaphore;
1327 
1328 	_PrepareReceivePath(segment);
1329 
1330 	// send SYN+ACK
1331 	if (_SendQueued() < B_OK)
1332 		return DROP;
1333 
1334 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1335 		// we handled this flag now, it must not be set for further processing
1336 
1337 	return _Receive(segment, buffer);
1338 }
1339 
1340 
1341 int32
1342 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer)
1343 {
1344 	TRACE("ListenReceive()");
1345 
1346 	// Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state,
1347 	// but the error behaviour differs
1348 	if (segment.flags & TCP_FLAG_RESET)
1349 		return DROP;
1350 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
1351 		return DROP | RESET;
1352 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1353 		return DROP;
1354 
1355 	// TODO: drop broadcast/multicast
1356 
1357 	// spawn new endpoint for accept()
1358 	net_socket* newSocket;
1359 	if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK)
1360 		return DROP;
1361 
1362 	return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this,
1363 		segment, buffer);
1364 }
1365 
1366 
1367 int32
1368 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment,
1369 	net_buffer *buffer)
1370 {
1371 	TRACE("_SynchronizeSentReceive()");
1372 
1373 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1374 		&& (fInitialSendSequence >= segment.acknowledge
1375 			|| fSendMax < segment.acknowledge))
1376 		return DROP | RESET;
1377 
1378 	if (segment.flags & TCP_FLAG_RESET) {
1379 		_HandleReset(ECONNREFUSED);
1380 		return DROP;
1381 	}
1382 
1383 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0)
1384 		return DROP;
1385 
1386 	fSendUnacknowledged = segment.acknowledge;
1387 	_PrepareReceivePath(segment);
1388 
1389 	if (segment.flags & TCP_FLAG_ACKNOWLEDGE) {
1390 		_MarkEstablished();
1391 	} else {
1392 		// simultaneous open
1393 		fState = SYNCHRONIZE_RECEIVED;
1394 		T(State(this));
1395 	}
1396 
1397 	segment.flags &= ~TCP_FLAG_SYNCHRONIZE;
1398 		// we handled this flag now, it must not be set for further processing
1399 
1400 	return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE;
1401 }
1402 
1403 
1404 int32
1405 TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer)
1406 {
1407 	uint32 advertisedWindow = (uint32)segment.advertised_window
1408 		<< fSendWindowShift;
1409 	size_t segmentLength = buffer->size;
1410 
1411 	// First, handle the most common case for uni-directional data transfer
1412 	// (known as header prediction - the segment must not change the window,
1413 	// and must be the expected sequence, and contain no control flags)
1414 
1415 	if (fState == ESTABLISHED
1416 		&& segment.AcknowledgeOnly()
1417 		&& fReceiveNext == segment.sequence
1418 		&& advertisedWindow > 0 && advertisedWindow == fSendWindow
1419 		&& fSendNext == fSendMax) {
1420 		_UpdateTimestamps(segment, segmentLength);
1421 
1422 		if (segmentLength == 0) {
1423 			// this is a pure acknowledge segment - we're on the sending end
1424 			if (fSendUnacknowledged < segment.acknowledge
1425 				&& fSendMax >= segment.acknowledge) {
1426 				_Acknowledged(segment);
1427 				return DROP;
1428 			}
1429 		} else if (segment.acknowledge == fSendUnacknowledged
1430 			&& fReceiveQueue.IsContiguous()
1431 			&& fReceiveQueue.Free() >= segmentLength
1432 			&& (fFlags & FLAG_NO_RECEIVE) == 0) {
1433 			if (_AddData(segment, buffer))
1434 				_NotifyReader();
1435 
1436 			return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0
1437 				? IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE);
1438 		}
1439 	}
1440 
1441 	// The fast path was not applicable, so we continue with the standard
1442 	// processing of the incoming segment
1443 
1444 	ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN);
1445 
1446 	if (fState != CLOSED && fState != TIME_WAIT) {
1447 		// Check sequence number
1448 		if (!segment_in_sequence(segment, segmentLength, fReceiveNext,
1449 				fReceiveWindow)) {
1450 			TRACE("  Receive(): segment out of window, next: %lu wnd: %lu",
1451 				(uint32)fReceiveNext, fReceiveWindow);
1452 			if (segment.flags & TCP_FLAG_RESET) {
1453 				// TODO: this doesn't look right - review!
1454 				return DROP;
1455 			}
1456 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1457 		}
1458 	}
1459 
1460 	if (segment.flags & TCP_FLAG_RESET) {
1461 		// Is this a valid reset?
1462 		// We generally ignore resets in time wait state (see RFC 1337)
1463 		if (fLastAcknowledgeSent <= segment.sequence
1464 			&& tcp_sequence(segment.sequence) < (fLastAcknowledgeSent
1465 				+ fReceiveWindow)
1466 			&& fState != TIME_WAIT) {
1467 			status_t error;
1468 			if (fState == SYNCHRONIZE_RECEIVED)
1469 				error = ECONNREFUSED;
1470 			else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE)
1471 				error = ENOTCONN;
1472 			else
1473 				error = ECONNRESET;
1474 
1475 			_HandleReset(error);
1476 		}
1477 
1478 		return DROP;
1479 	}
1480 
1481 	if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1482 		|| (fState == SYNCHRONIZE_RECEIVED
1483 			&& (fInitialReceiveSequence > segment.sequence
1484 				|| (segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0
1485 					&& (fSendUnacknowledged > segment.acknowledge
1486 						|| fSendMax < segment.acknowledge)))) {
1487 		// reset the connection - either the initial SYN was faulty, or we
1488 		// received a SYN within the data stream
1489 		return DROP | RESET;
1490 	}
1491 
1492 	fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow);
1493 		// the window must not shrink
1494 
1495 	// trim buffer to be within the receive window
1496 	int32 drop = fReceiveNext - segment.sequence;
1497 	if (drop > 0) {
1498 		if ((uint32)drop > buffer->size
1499 			|| ((uint32)drop == buffer->size
1500 				&& (segment.flags & TCP_FLAG_FINISH) == 0)) {
1501 			// don't accidently remove a FIN we shouldn't remove
1502 			segment.flags &= ~TCP_FLAG_FINISH;
1503 			drop = buffer->size;
1504 		}
1505 
1506 		// remove duplicate data at the start
1507 		TRACE("* remove %ld bytes from the start", drop);
1508 		gBufferModule->remove_header(buffer, drop);
1509 		segment.sequence += drop;
1510 	}
1511 
1512 	int32 action = KEEP;
1513 
1514 	drop = segment.sequence + buffer->size - (fReceiveNext + fReceiveWindow);
1515 	if (drop > 0) {
1516 		// remove data exceeding our window
1517 		if ((uint32)drop >= buffer->size) {
1518 			// if we can accept data, or the segment is not what we'd expect,
1519 			// drop the segment (an immediate acknowledge is always triggered)
1520 			if (fReceiveWindow != 0 || segment.sequence != fReceiveNext)
1521 				return DROP | IMMEDIATE_ACKNOWLEDGE;
1522 
1523 			action |= IMMEDIATE_ACKNOWLEDGE;
1524 		}
1525 
1526 		if ((segment.flags & TCP_FLAG_FINISH) != 0) {
1527 			// we need to remove the finish, too, as part of the data
1528 			drop--;
1529 		}
1530 
1531 		segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH);
1532 		TRACE("* remove %ld bytes from the end", drop);
1533 		gBufferModule->remove_trailer(buffer, drop);
1534 	}
1535 
1536 #ifdef TRACE_TCP
1537 	if (advertisedWindow > fSendWindow) {
1538 		TRACE("  Receive(): Window update %lu -> %lu", fSendWindow,
1539 			advertisedWindow);
1540 	}
1541 #endif
1542 
1543 	fSendWindow = advertisedWindow;
1544 	if (advertisedWindow > fSendMaxWindow)
1545 		fSendMaxWindow = advertisedWindow;
1546 
1547 	// Then look at the acknowledgement for any updates
1548 
1549 	if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0) {
1550 		// process acknowledged data
1551 		if (fState == SYNCHRONIZE_RECEIVED)
1552 			_MarkEstablished();
1553 
1554 		if (fSendMax < segment.acknowledge)
1555 			return DROP | IMMEDIATE_ACKNOWLEDGE;
1556 
1557 		if (segment.acknowledge < fSendUnacknowledged) {
1558 			if (buffer->size == 0 && advertisedWindow == fSendWindow
1559 				&& (segment.flags & TCP_FLAG_FINISH) == 0) {
1560 				TRACE("Receive(): duplicate ack!");
1561 
1562 				_DuplicateAcknowledge(segment);
1563 			}
1564 
1565 			return DROP;
1566 		} else {
1567 			// this segment acknowledges in flight data
1568 
1569 			if (fDuplicateAcknowledgeCount >= 3) {
1570 				// deflate the window.
1571 				fCongestionWindow = fSlowStartThreshold;
1572 			}
1573 
1574 			fDuplicateAcknowledgeCount = 0;
1575 
1576 			if (fSendMax == segment.acknowledge)
1577 				TRACE("Receive(): all inflight data ack'd!");
1578 
1579 			if (segment.acknowledge > fSendQueue.LastSequence()
1580 					&& fState > ESTABLISHED) {
1581 				TRACE("Receive(): FIN has been acknowledged!");
1582 
1583 				switch (fState) {
1584 					case FINISH_SENT:
1585 						fState = FINISH_ACKNOWLEDGED;
1586 						T(State(this));
1587 						break;
1588 					case CLOSING:
1589 						fState = TIME_WAIT;
1590 						T(State(this));
1591 						_EnterTimeWait();
1592 						return DROP;
1593 					case WAIT_FOR_FINISH_ACKNOWLEDGE:
1594 						_Close();
1595 						break;
1596 
1597 					default:
1598 						break;
1599 				}
1600 			}
1601 
1602 			if (fState != CLOSED)
1603 				_Acknowledged(segment);
1604 		}
1605 	}
1606 
1607 	if (segment.flags & TCP_FLAG_URGENT) {
1608 		if (fState == ESTABLISHED || fState == FINISH_SENT
1609 			|| fState == FINISH_ACKNOWLEDGED) {
1610 			// TODO: Handle urgent data:
1611 			//  - RCV.UP <- max(RCV.UP, SEG.UP)
1612 			//  - signal the user that urgent data is available (SIGURG)
1613 		}
1614 	}
1615 
1616 	bool notify = false;
1617 
1618 	if (buffer->size > 0 &&	_ShouldReceive())
1619 		notify = _AddData(segment, buffer);
1620 	else {
1621 		if ((fFlags & FLAG_NO_RECEIVE) != 0)
1622 			fReceiveNext += buffer->size;
1623 
1624 		action = (action & ~KEEP) | DROP;
1625 	}
1626 
1627 	if (segment.flags & TCP_FLAG_FINISH) {
1628 		segmentLength++;
1629 		if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) {
1630 			TRACE("Receive(): peer is finishing connection!");
1631 			fReceiveNext++;
1632 			notify = true;
1633 
1634 			// FIN implies PUSH
1635 			fReceiveQueue.SetPushPointer();
1636 
1637 			// we'll reply immediately to the FIN if we are not
1638 			// transitioning to TIME WAIT so we immediatly ACK it.
1639 			action |= IMMEDIATE_ACKNOWLEDGE;
1640 
1641 			// other side is closing connection; change states
1642 			switch (fState) {
1643 				case ESTABLISHED:
1644 				case SYNCHRONIZE_RECEIVED:
1645 					fState = FINISH_RECEIVED;
1646 					T(State(this));
1647 					break;
1648 				case FINISH_SENT:
1649 					// simultaneous close
1650 					fState = CLOSING;
1651 					T(State(this));
1652 					break;
1653 				case FINISH_ACKNOWLEDGED:
1654 					fState = TIME_WAIT;
1655 					T(State(this));
1656 					_EnterTimeWait();
1657 					break;
1658 				case TIME_WAIT:
1659 					_UpdateTimeWait();
1660 					break;
1661 
1662 				default:
1663 					break;
1664 			}
1665 		}
1666 	}
1667 
1668 	if (notify)
1669 		_NotifyReader();
1670 
1671 	if (buffer->size > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0)
1672 		action |= ACKNOWLEDGE;
1673 
1674 	_UpdateTimestamps(segment, segmentLength);
1675 
1676 	TRACE("Receive() Action %ld", action);
1677 
1678 	return action;
1679 }
1680 
1681 
1682 int32
1683 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer)
1684 {
1685 	MutexLocker locker(fLock);
1686 
1687 	TRACE("SegmentReceived(): buffer %p (%lu bytes) address %s to %s\n"
1688 		"\tflags 0x%x, seq %lu, ack %lu, wnd %lu",
1689 		buffer, buffer->size, PrintAddress(buffer->source),
1690 		PrintAddress(buffer->destination), segment.flags, segment.sequence,
1691 		segment.acknowledge,
1692 		(uint32)segment.advertised_window << fSendWindowShift);
1693 	T(Receive(this, segment,
1694 		(uint32)segment.advertised_window << fSendWindowShift, buffer));
1695 	int32 segmentAction = DROP;
1696 
1697 	switch (fState) {
1698 		case LISTEN:
1699 			segmentAction = _ListenReceive(segment, buffer);
1700 			break;
1701 
1702 		case SYNCHRONIZE_SENT:
1703 			segmentAction = _SynchronizeSentReceive(segment, buffer);
1704 			break;
1705 
1706 		case SYNCHRONIZE_RECEIVED:
1707 		case ESTABLISHED:
1708 		case FINISH_RECEIVED:
1709 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1710 		case FINISH_SENT:
1711 		case FINISH_ACKNOWLEDGED:
1712 		case CLOSING:
1713 		case TIME_WAIT:
1714 		case CLOSED:
1715 			segmentAction = _Receive(segment, buffer);
1716 			break;
1717 	}
1718 
1719 	// process acknowledge action as asked for by the *Receive() method
1720 	if (segmentAction & IMMEDIATE_ACKNOWLEDGE)
1721 		SendAcknowledge(true);
1722 	else if (segmentAction & ACKNOWLEDGE)
1723 		DelayedAcknowledge();
1724 
1725 	if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE))
1726 			== (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) {
1727 		locker.Unlock();
1728 		gSocketModule->delete_socket(socket);
1729 			// this will also delete us
1730 	}
1731 
1732 	return segmentAction;
1733 }
1734 
1735 
1736 //	#pragma mark - send
1737 
1738 
1739 inline uint8
1740 TCPEndpoint::_CurrentFlags()
1741 {
1742 	// we don't set FLAG_FINISH here, instead we do it
1743 	// conditionally below depending if we are sending
1744 	// the last bytes of the send queue.
1745 
1746 	switch (fState) {
1747 		case CLOSED:
1748 			return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE;
1749 
1750 		case SYNCHRONIZE_SENT:
1751 			return TCP_FLAG_SYNCHRONIZE;
1752 		case SYNCHRONIZE_RECEIVED:
1753 			return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE;
1754 
1755 		case ESTABLISHED:
1756 		case FINISH_RECEIVED:
1757 		case FINISH_ACKNOWLEDGED:
1758 		case TIME_WAIT:
1759 		case WAIT_FOR_FINISH_ACKNOWLEDGE:
1760 		case FINISH_SENT:
1761 		case CLOSING:
1762 			return TCP_FLAG_ACKNOWLEDGE;
1763 
1764 		default:
1765 			return 0;
1766 	}
1767 }
1768 
1769 
1770 inline bool
1771 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length,
1772 	uint32 segmentMaxSize, uint32 flightSize)
1773 {
1774 	if (length > 0) {
1775 		// Avoid the silly window syndrome - we only send a segment in case:
1776 		// - we have a full segment to send, or
1777 		// - we're at the end of our buffer queue, or
1778 		// - the buffer is at least larger than half of the maximum send window,
1779 		//   or
1780 		// - we're retransmitting data
1781 		if (length == segmentMaxSize
1782 			|| (fOptions & TCP_NODELAY) != 0
1783 			|| tcp_sequence(fSendNext + length) == fSendQueue.LastSequence()
1784 			|| (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2))
1785 			return true;
1786 	}
1787 
1788 	// check if we need to send a window update to the peer
1789 	if (segment.advertised_window > 0) {
1790 		// correct the window to take into account what already has been advertised
1791 		uint32 window = (segment.advertised_window << fReceiveWindowShift)
1792 			- (fReceiveMaxAdvertised - fReceiveNext);
1793 
1794 		// if we can advertise a window larger than twice the maximum segment
1795 		// size, or half the maximum buffer size we send a window update
1796 		if (window >= (fReceiveMaxSegmentSize << 1)
1797 			|| window >= (socket->receive.buffer_size >> 1))
1798 			return true;
1799 	}
1800 
1801 	if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH
1802 			| TCP_FLAG_RESET)) != 0)
1803 		return true;
1804 
1805 	// We do have urgent data pending
1806 	if (fSendUrgentOffset > fSendNext)
1807 		return true;
1808 
1809 	// there is no reason to send a segment just now
1810 	return false;
1811 }
1812 
1813 
1814 status_t
1815 TCPEndpoint::_SendQueued(bool force)
1816 {
1817 	return _SendQueued(force, fSendWindow);
1818 }
1819 
1820 
1821 /*!	Sends one or more TCP segments with the data waiting in the queue, or some
1822 	specific flags that need to be sent.
1823 */
1824 status_t
1825 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow)
1826 {
1827 	if (fRoute == NULL)
1828 		return B_ERROR;
1829 
1830 	// in passive state?
1831 	if (fState == LISTEN)
1832 		return B_ERROR;
1833 
1834 	tcp_segment_header segment(_CurrentFlags());
1835 
1836 	if ((fOptions & TCP_NOOPT) == 0) {
1837 		if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) {
1838 			segment.options |= TCP_HAS_TIMESTAMPS;
1839 			segment.timestamp_reply = fReceivedTimestamp;
1840 			segment.timestamp_value = tcp_now();
1841 		}
1842 
1843 		if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0
1844 			&& fSendNext == fInitialSendSequence) {
1845 			// add connection establishment options
1846 			segment.max_segment_size = fReceiveMaxSegmentSize;
1847 			if (fFlags & FLAG_OPTION_WINDOW_SCALE) {
1848 				segment.options |= TCP_HAS_WINDOW_SCALE;
1849 				segment.window_shift = fReceiveWindowShift;
1850 			}
1851 		}
1852 	}
1853 
1854 	size_t availableBytes = fReceiveQueue.Free();
1855 	if (fFlags & FLAG_OPTION_WINDOW_SCALE)
1856 		segment.advertised_window = availableBytes >> fReceiveWindowShift;
1857 	else
1858 		segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes);
1859 
1860 	segment.acknowledge = fReceiveNext;
1861 
1862 	// Process urgent data
1863 	if (fSendUrgentOffset > fSendNext) {
1864 		segment.flags |= TCP_FLAG_URGENT;
1865 		segment.urgent_offset = fSendUrgentOffset - fSendNext;
1866 	} else {
1867 		fSendUrgentOffset = fSendUnacknowledged;
1868 			// Keep urgent offset updated, so that it doesn't reach into our
1869 			// send window on overlap
1870 		segment.urgent_offset = 0;
1871 	}
1872 
1873 	if (fCongestionWindow > 0 && fCongestionWindow < sendWindow)
1874 		sendWindow = fCongestionWindow;
1875 
1876 	// fSendUnacknowledged
1877 	//  |    fSendNext      fSendMax
1878 	//  |        |              |
1879 	//  v        v              v
1880 	//  -----------------------------------
1881 	//  | effective window           |
1882 	//  -----------------------------------
1883 
1884 	// Flight size represents the window of data which is currently in the
1885 	// ether. We should never send data such as the flight size becomes larger
1886 	// than the effective window. Note however that the effective window may be
1887 	// reduced (by congestion for instance), so at some point in time flight
1888 	// size may be larger than the currently calculated window.
1889 
1890 	uint32 flightSize = fSendMax - fSendUnacknowledged;
1891 	uint32 consumedWindow = fSendNext - fSendUnacknowledged;
1892 
1893 	if (consumedWindow > sendWindow) {
1894 		sendWindow = 0;
1895 		// TODO enter persist state? try to get a window update.
1896 	} else
1897 		sendWindow -= consumedWindow;
1898 
1899 	if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) {
1900 		// send one byte of data to ask for a window update
1901 		// (triggered by the persist timer)
1902 		sendWindow = 1;
1903 	}
1904 
1905 	uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow);
1906 	tcp_sequence previousSendNext = fSendNext;
1907 
1908 	do {
1909 		uint32 segmentMaxSize = fSendMaxSegmentSize
1910 			- tcp_options_length(segment);
1911 		uint32 segmentLength = min_c(length, segmentMaxSize);
1912 
1913 		if (fSendNext + segmentLength == fSendQueue.LastSequence()) {
1914 			if (state_needs_finish(fState))
1915 				segment.flags |= TCP_FLAG_FINISH;
1916 			if (length > 0)
1917 				segment.flags |= TCP_FLAG_PUSH;
1918 		}
1919 
1920 		// Determine if we should really send this segment
1921 		if (!force && !_ShouldSendSegment(segment, segmentLength,
1922 			segmentMaxSize, flightSize)) {
1923 			if (fSendQueue.Available()
1924 				&& !gStackModule->is_timer_active(&fPersistTimer)
1925 				&& !gStackModule->is_timer_active(&fRetransmitTimer))
1926 				_StartPersistTimer();
1927 			break;
1928 		}
1929 
1930 		net_buffer *buffer = gBufferModule->create(256);
1931 		if (buffer == NULL)
1932 			return B_NO_MEMORY;
1933 
1934 		status_t status = B_OK;
1935 		if (segmentLength > 0)
1936 			status = fSendQueue.Get(buffer, fSendNext, segmentLength);
1937 		if (status < B_OK) {
1938 			gBufferModule->free(buffer);
1939 			return status;
1940 		}
1941 
1942 		LocalAddress().CopyTo(buffer->source);
1943 		PeerAddress().CopyTo(buffer->destination);
1944 
1945 		uint32 size = buffer->size;
1946 		segment.sequence = fSendNext;
1947 
1948 		TRACE("SendQueued(): buffer %p (%lu bytes) address %s to %s\n"
1949 			"\tflags 0x%x, seq %lu, ack %lu, rwnd %hu, cwnd %lu, ssthresh %lu\n"
1950 			"\tlen %lu first %lu last %lu",
1951 			buffer, buffer->size, PrintAddress(buffer->source),
1952 			PrintAddress(buffer->destination), segment.flags, segment.sequence,
1953 			segment.acknowledge, segment.advertised_window,
1954 			fCongestionWindow, fSlowStartThreshold, segmentLength,
1955 			(uint32)fSendQueue.FirstSequence(),
1956 			(uint32)fSendQueue.LastSequence());
1957 		T(Send(this, segment, buffer, fSendQueue.FirstSequence(),
1958 			fSendQueue.LastSequence()));
1959 
1960 		PROBE(buffer, sendWindow);
1961 		sendWindow -= buffer->size;
1962 
1963 		status = add_tcp_header(AddressModule(), segment, buffer);
1964 		if (status != B_OK) {
1965 			gBufferModule->free(buffer);
1966 			return status;
1967 		}
1968 
1969 		// Update send status - we need to do this before we send the data
1970 		// for local connections as the answer is directly handled
1971 
1972 		if (segment.flags & TCP_FLAG_SYNCHRONIZE) {
1973 			segment.options &= ~TCP_HAS_WINDOW_SCALE;
1974 			segment.max_segment_size = 0;
1975 			size++;
1976 		}
1977 
1978 		if (segment.flags & TCP_FLAG_FINISH)
1979 			size++;
1980 
1981 		uint32 sendMax = fSendMax;
1982 		fSendNext += size;
1983 		if (fSendMax < fSendNext)
1984 			fSendMax = fSendNext;
1985 
1986 		fReceiveMaxAdvertised = fReceiveNext
1987 			+ ((uint32)segment.advertised_window << fReceiveWindowShift);
1988 
1989 		status = next->module->send_routed_data(next, fRoute, buffer);
1990 		if (status < B_OK) {
1991 			gBufferModule->free(buffer);
1992 
1993 			fSendNext = segment.sequence;
1994 			fSendMax = sendMax;
1995 				// restore send status
1996 			return status;
1997 		}
1998 
1999 		if (segment.flags & TCP_FLAG_ACKNOWLEDGE)
2000 			fLastAcknowledgeSent = segment.acknowledge;
2001 
2002 		length -= segmentLength;
2003 		segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET
2004 			| TCP_FLAG_FINISH);
2005 	} while (length > 0);
2006 
2007 	// if we sent data from the beggining of the send queue,
2008 	// start the retransmition timer
2009 	if (previousSendNext == fSendUnacknowledged
2010 		&& fSendNext > previousSendNext) {
2011 		TRACE("  SendQueue(): set retransmit timer with rto %llu",
2012 			fRetransmitTimeout);
2013 
2014 		gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout);
2015 	}
2016 
2017 	return B_OK;
2018 }
2019 
2020 
2021 int
2022 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const
2023 {
2024 	return next->module->get_mtu(next, address) - sizeof(tcp_header);
2025 }
2026 
2027 
2028 status_t
2029 TCPEndpoint::_PrepareSendPath(const sockaddr* peer)
2030 {
2031 	if (fRoute == NULL) {
2032 		fRoute = gDatalinkModule->get_route(Domain(), peer);
2033 		if (fRoute == NULL)
2034 			return ENETUNREACH;
2035 
2036 		if ((fRoute->flags & RTF_LOCAL) != 0)
2037 			fFlags |= FLAG_LOCAL;
2038 	}
2039 
2040 	// make sure connection does not already exist
2041 	status_t status = fManager->SetConnection(this, *LocalAddress(), peer,
2042 		fRoute->interface->address);
2043 	if (status < B_OK)
2044 		return status;
2045 
2046 	fInitialSendSequence = system_time() >> 4;
2047 	fSendNext = fInitialSendSequence;
2048 	fSendUnacknowledged = fInitialSendSequence;
2049 	fSendMax = fInitialSendSequence;
2050 	fSendUrgentOffset = fInitialSendSequence;
2051 
2052 	// we are counting the SYN here
2053 	fSendQueue.SetInitialSequence(fSendNext + 1);
2054 
2055 	fReceiveMaxSegmentSize = _MaxSegmentSize(peer);
2056 
2057 	// Compute the window shift we advertise to our peer - if it doesn't support
2058 	// this option, this will be reset to 0 (when its SYN is received)
2059 	fReceiveWindowShift = 0;
2060 	while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT
2061 		&& (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) {
2062 		fReceiveWindowShift++;
2063 	}
2064 
2065 	return B_OK;
2066 }
2067 
2068 
2069 void
2070 TCPEndpoint::_Acknowledged(tcp_segment_header& segment)
2071 {
2072 	size_t previouslyUsed = fSendQueue.Used();
2073 
2074 	fSendQueue.RemoveUntil(segment.acknowledge);
2075 	fSendUnacknowledged = segment.acknowledge;
2076 
2077 	if (fSendNext < fSendUnacknowledged)
2078 		fSendNext = fSendUnacknowledged;
2079 
2080 	if (fSendUnacknowledged == fSendMax)
2081 		gStackModule->cancel_timer(&fRetransmitTimer);
2082 
2083 	if (fSendQueue.Used() < previouslyUsed) {
2084 		// this ACK acknowledged data
2085 
2086 		if (segment.options & TCP_HAS_TIMESTAMPS)
2087 			_UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply));
2088 		else {
2089 			// TODO: Fallback to RFC 793 type estimation
2090 		}
2091 
2092 		if (is_writable(fState)) {
2093 			// notify threads waiting on the socket to become writable again
2094 			fSendList.Signal();
2095 			gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Used());
2096 		}
2097 
2098 		if (fCongestionWindow < fSlowStartThreshold)
2099 			fCongestionWindow += fSendMaxSegmentSize;
2100 	}
2101 
2102 	if (fCongestionWindow >= fSlowStartThreshold) {
2103 		uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize;
2104 
2105 		if (increment < fCongestionWindow)
2106 			increment = 1;
2107 		else
2108 			increment /= fCongestionWindow;
2109 
2110 		fCongestionWindow += increment;
2111 	}
2112 
2113 	// if there is data left to be send, send it now
2114 	if (fSendQueue.Used() > 0)
2115 		_SendQueued();
2116 }
2117 
2118 
2119 void
2120 TCPEndpoint::_Retransmit()
2121 {
2122 	TRACE("Retransmit()");
2123 	_ResetSlowStart();
2124 	fSendNext = fSendUnacknowledged;
2125 	_SendQueued();
2126 }
2127 
2128 
2129 void
2130 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime)
2131 {
2132 	int32 rtt = roundTripTime;
2133 
2134 	// "smooth" round trip time as per Van Jacobson
2135 	rtt -= fRoundTripTime / 8;
2136 	fRoundTripTime += rtt;
2137 	if (rtt < 0)
2138 		rtt = -rtt;
2139 	rtt -= fRoundTripDeviation / 4;
2140 	fRoundTripDeviation += rtt;
2141 
2142 	fRetransmitTimeout = ((fRoundTripTime / 4 + fRoundTripDeviation) / 2)
2143 		* kTimestampFactor;
2144 
2145 	TRACE("  RTO is now %llu (after rtt %ldms)", fRetransmitTimeout,
2146 		roundTripTime);
2147 }
2148 
2149 
2150 void
2151 TCPEndpoint::_ResetSlowStart()
2152 {
2153 	fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged) / 2,
2154 		2 * fSendMaxSegmentSize);
2155 	fCongestionWindow = fSendMaxSegmentSize;
2156 }
2157 
2158 
2159 //	#pragma mark - timer
2160 
2161 
2162 /*static*/ void
2163 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint)
2164 {
2165 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2166 	T(Timer(endpoint, "retransmit"));
2167 
2168 	MutexLocker locker(endpoint->fLock);
2169 	if (!locker.IsLocked())
2170 		return;
2171 
2172 	endpoint->_Retransmit();
2173 }
2174 
2175 
2176 /*static*/ void
2177 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint)
2178 {
2179 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2180 	T(Timer(endpoint, "persist"));
2181 
2182 	MutexLocker locker(endpoint->fLock);
2183 	if (!locker.IsLocked())
2184 		return;
2185 
2186 	// the timer might not have been canceled early enough
2187 	if (endpoint->State() == CLOSED)
2188 		return;
2189 
2190 	endpoint->_SendQueued(true);
2191 }
2192 
2193 
2194 /*static*/ void
2195 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint)
2196 {
2197 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2198 	T(Timer(endpoint, "delayed ack"));
2199 
2200 	MutexLocker locker(endpoint->fLock);
2201 	if (!locker.IsLocked())
2202 		return;
2203 
2204 	// the timer might not have been canceled early enough
2205 	if (endpoint->State() == CLOSED)
2206 		return;
2207 
2208 	endpoint->SendAcknowledge(true);
2209 }
2210 
2211 
2212 /*static*/ void
2213 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint)
2214 {
2215 	TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint;
2216 	T(Timer(endpoint, "time-wait"));
2217 
2218 	MutexLocker locker(endpoint->fLock);
2219 	if (!locker.IsLocked())
2220 		return;
2221 
2222 	if ((endpoint->fFlags & FLAG_CLOSED) == 0) {
2223 		endpoint->fFlags |= FLAG_DELETE_ON_CLOSE;
2224 		return;
2225 	}
2226 
2227 	locker.Unlock();
2228 
2229 	gSocketModule->delete_socket(endpoint->socket);
2230 }
2231 
2232 
2233 //	#pragma mark -
2234 
2235 
2236 void
2237 TCPEndpoint::Dump() const
2238 {
2239 	kprintf("TCP endpoint %p\n", this);
2240 	kprintf("  state: %s\n", name_for_state(fState));
2241 	kprintf("  flags: 0x%lx\n", fFlags);
2242 #ifdef KDEBUG
2243 	kprintf("  lock: { %p, holder: %ld }\n", &fLock, fLock.holder);
2244 #endif
2245 	kprintf("  accept sem: %ld\n", fAcceptSemaphore);
2246 	kprintf("  options: 0x%lx\n", (uint32)fOptions);
2247 	kprintf("  send\n");
2248 	kprintf("    window shift: %lu\n", (uint32)fSendWindowShift);
2249 	kprintf("    unacknowledged: %lu\n", (uint32)fSendUnacknowledged);
2250 	kprintf("    next: %lu\n", (uint32)fSendNext);
2251 	kprintf("    max: %lu\n", (uint32)fSendMax);
2252 	kprintf("    urgent offset: %lu\n", (uint32)fSendUrgentOffset);
2253 	kprintf("    window: %lu\n", fSendWindow);
2254 	kprintf("    max window: %lu\n", fSendMaxWindow);
2255 	kprintf("    max segment size: %lu\n", fSendMaxSegmentSize);
2256 	kprintf("    queue: %lu / %lu\n", fSendQueue.Used(), fSendQueue.Size());
2257 	kprintf("    last acknowledge sent: %lu\n", (uint32)fLastAcknowledgeSent);
2258 	kprintf("    initial sequence: %lu\n", (uint32)fInitialSendSequence);
2259 	kprintf("  receive\n");
2260 	kprintf("    window shift: %lu\n", (uint32)fReceiveWindowShift);
2261 	kprintf("    next: %lu\n", (uint32)fReceiveNext);
2262 	kprintf("    max advertised: %lu\n", (uint32)fReceiveMaxAdvertised);
2263 	kprintf("    window: %lu\n", (uint32)fReceiveWindow);
2264 	kprintf("    max segment size: %lu\n", (uint32)fReceiveMaxSegmentSize);
2265 	kprintf("    queue: %lu / %lu\n", fReceiveQueue.Available(),
2266 		fReceiveQueue.Size());
2267 	kprintf("    initial sequence: %lu\n", (uint32)fInitialReceiveSequence);
2268 	kprintf("    duplicate acknowledge count: %lu\n",
2269 		fDuplicateAcknowledgeCount);
2270 	kprintf("  round trip time: %ld (deviation %ld)\n", fRoundTripTime,
2271 		fRoundTripDeviation);
2272 	kprintf("  retransmit timeout: %llu\n", (uint64)fRetransmitTimeout);
2273 	kprintf("  congestion window: %lu\n", fCongestionWindow);
2274 	kprintf("  slow start threshold: %lu\n", fSlowStartThreshold);
2275 }
2276 
2277